path: root/src/core/CL/cl_kernels
author     Viet-Hoa Do <viet-hoa.do@arm.com>  2023-10-09 10:58:35 +0100
committer  Viet-Hoa Do <viet-hoa.do@arm.com>  2023-10-11 10:01:49 +0000
commit     c210c85548c7f627690ed9259622d3ab342fe612 (patch)
tree       6385edb5083a805bac8ddd83567a1e1dac0715ce /src/core/CL/cl_kernels
parent     fb9c25d27791d934300581596cce7c5875a79a80 (diff)
download   ComputeLibrary-c210c85548c7f627690ed9259622d3ab342fe612.tar.gz
Optimize CL reduction operation
* Batch dimension is added to reduction operation.
  - All the dimensions higher than the batch dimension are collapsed so
    that the input and output tensors are always 3-4D.
  - CL kernel is called once instead of being repeatedly called to
    process each sliding window.

Resolves: COMPMID-6443
Signed-off-by: Viet-Hoa Do <viet-hoa.do@arm.com>
Change-Id: Icd99939d52d3bb648f08537e5f52ef27e894061b
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/10456
Reviewed-by: Jakub Sujak <jakub.sujak@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Benchmark: Arm Jenkins <bsgcomp@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
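The collapse described in the commit message can be pictured with a short, self-contained C sketch. This is only an illustration under assumptions: the shape array and the batch-dimension index are placeholders, not the library's own tensor-shape types. Every dimension above the batch dimension is folded into the batch dimension, so the kernels in the diff below always see a 3-4D tensor and can be enqueued once for the whole workload rather than once per sliding window.

#include <stdio.h>

/* Minimal sketch of the dimension collapse: fold every dimension above the
 * batch dimension (index 3 here) into the batch dimension itself, so a
 * reduction kernel only ever sees a 3-4D tensor. The shape array below is
 * illustrative and not the library's actual tensor-shape representation. */
static void collapse_above_batch(size_t dims[], size_t *num_dims)
{
    const size_t batch = 3;
    if (*num_dims <= batch + 1)
    {
        return; /* already 3-4D, nothing to collapse */
    }
    for (size_t d = batch + 1; d < *num_dims; ++d)
    {
        dims[batch] *= dims[d]; /* higher dims are absorbed into the batch */
    }
    *num_dims = batch + 1;
}

int main(void)
{
    size_t dims[6] = {32, 16, 8, 2, 3, 5}; /* hypothetical 6-D shape */
    size_t n       = 6;

    collapse_above_batch(dims, &n);

    /* prints: 32 x 16 x 8 x 30 */
    printf("%zu x %zu x %zu x %zu\n", dims[0], dims[1], dims[2], dims[3]);
    return 0;
}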
Diffstat (limited to 'src/core/CL/cl_kernels')
-rw-r--r--  src/core/CL/cl_kernels/common/reduction_operation.cl  92
1 file changed, 49 insertions(+), 43 deletions(-)
diff --git a/src/core/CL/cl_kernels/common/reduction_operation.cl b/src/core/CL/cl_kernels/common/reduction_operation.cl
index 1cb6664078..99369be19a 100644
--- a/src/core/CL/cl_kernels/common/reduction_operation.cl
+++ b/src/core/CL/cl_kernels/common/reduction_operation.cl
@@ -186,27 +186,28 @@ __kernel void reduction_operation_non_parallel_x(
* @note The height size must be passed at compile time using -DHEIGHT e.g. -DHEIGHT=128
*
* @param[in] input_ptr Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED/S32/F16/F32
- * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
* @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
* @param[in] output_ptr The local buffer to hold summed values. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the output tensor in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the output tensor in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the output tensor in Z dimension (in bytes)
* @param[in] output_offset_first_element_in_bytes The offset of the first element in the source tensor
*/
__kernel void reduction_operation_y(
- IMAGE_DECLARATION(input),
- IMAGE_DECLARATION(output))
+ __global uchar *input_ptr,
+ uint input_stride_y,
+ uint input_stride_z,
+ uint input_offset_first_element_in_bytes,
+
+ __global uchar *output_ptr,
+ uint output_stride_z,
+ uint output_offset_first_element_in_bytes)
{
int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
- int y = get_global_id(1);
+ int z = get_global_id(1);
- __global uchar *input_addr = input_ptr + input_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * input_stride_y;
- __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * output_stride_y;
+ __global uchar *input_addr = input_ptr + input_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + z * input_stride_z;
+ __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + z * output_stride_z;
VEC_DATA_TYPE(DATA_TYPE_PROMOTED, VEC_SIZE)
res = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input_addr), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, VEC_SIZE));
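Because reduction_operation_y now takes a raw pointer plus explicit strides and offsets instead of IMAGE_DECLARATION, the host passes each argument individually. Below is a hedged raw-OpenCL sketch of what such a launch could look like; ComputeLibrary's own ICLKernel/Window plumbing is not shown, and the helper name, stride values, offsets, and work sizes are placeholders.

#include <CL/cl.h>

/* Hedged sketch: set the reworked reduction_operation_y arguments in the
 * order of the new kernel signature and launch a single 2-D NDRange covering
 * all batches. Queue, kernel and buffers are assumed to be created elsewhere;
 * the zero offsets are purely for illustration. */
cl_int enqueue_reduction_y(cl_command_queue queue, cl_kernel kernel,
                           cl_mem input, cl_mem output,
                           cl_uint input_stride_y, cl_uint input_stride_z,
                           cl_uint output_stride_z,
                           size_t x_work_items, size_t num_batches)
{
    cl_uint offset = 0; /* *_offset_first_element_in_bytes, placeholder */
    cl_uint idx    = 0;

    clSetKernelArg(kernel, idx++, sizeof(cl_mem),  &input);
    clSetKernelArg(kernel, idx++, sizeof(cl_uint), &input_stride_y);
    clSetKernelArg(kernel, idx++, sizeof(cl_uint), &input_stride_z);
    clSetKernelArg(kernel, idx++, sizeof(cl_uint), &offset);

    clSetKernelArg(kernel, idx++, sizeof(cl_mem),  &output);
    clSetKernelArg(kernel, idx++, sizeof(cl_uint), &output_stride_z);
    clSetKernelArg(kernel, idx++, sizeof(cl_uint), &offset);

    /* Dimension 0 walks x in VEC_SIZE chunks, dimension 1 walks the collapsed
     * batch dimension (z inside the kernel), so one enqueue covers everything. */
    const size_t gws[2] = {x_work_items, num_batches};
    return clEnqueueNDRangeKernel(queue, kernel, 2, NULL, gws, NULL, 0, NULL, NULL);
}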
@@ -275,32 +276,33 @@ __kernel void reduction_operation_y(
* @note The depth size must be passed at compile time using -DDEPTH e.g. -DDEPTH=128
*
* @param[in] input_ptr Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED/S32/F16/F32
- * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes)
* @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
* @param[in] output_ptr The local buffer to hold summed values. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the output tensor in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] output_stride_y Stride of the output tensor in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the output tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_stride_w Stride of the output tensor in W dimension (in bytes)
* @param[in] output_offset_first_element_in_bytes The offset of the first element in the source tensor
*/
__kernel void reduction_operation_z(
- TENSOR3D_DECLARATION(input),
- TENSOR3D_DECLARATION(output))
+ __global uchar *input_ptr,
+ uint input_stride_y,
+ uint input_stride_z,
+ uint input_stride_w,
+ uint input_offset_first_element_in_bytes,
+
+ __global uchar *output_ptr,
+ uint output_stride_y,
+ uint output_stride_w,
+ uint output_offset_first_element_in_bytes)
{
int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
int y = get_global_id(1);
- int z = get_global_id(2);
+ int w = get_global_id(2);
- __global uchar *input_addr = input_ptr + input_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * input_stride_y + z * input_stride_z;
- __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * output_stride_y + z * output_stride_z;
+ __global uchar *input_addr = input_ptr + input_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * input_stride_y + w * input_stride_w;
+ __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * output_stride_y + w * output_stride_w;
VEC_DATA_TYPE(DATA_TYPE_PROMOTED, VEC_SIZE)
res = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input_addr), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, VEC_SIZE));
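The @note lines above list the compile-time defines each variant expects (-DDATA_TYPE, the reduced-axis size such as -DHEIGHT or -DDEPTH, plus -DVEC_SIZE/-DVEC_SIZE_LEFTOVER used in the kernel bodies). A hedged sketch of passing such options through a raw clBuildProgram call follows; the concrete values are placeholders, the real kernels may need further defines not visible in these hunks, and ComputeLibrary assembles its options through its own build-options machinery rather than a literal string.

#include <CL/cl.h>
#include <stdio.h>

/* Hedged sketch: build the reduction kernels with the compile-time defines
 * documented above. All values below are placeholders. */
cl_int build_reduction_program(cl_program program, cl_device_id device)
{
    const char *options =
        "-DDATA_TYPE=float -DDATA_TYPE_PROMOTED=float "
        "-DVEC_SIZE=16 -DVEC_SIZE_LEFTOVER=0 "
        "-DHEIGHT=128 -DDEPTH=128 -DBATCH=2";

    cl_int err = clBuildProgram(program, 1, &device, options, NULL, NULL);
    if (err != CL_SUCCESS)
    {
        fprintf(stderr, "clBuildProgram failed: %d\n", err);
    }
    return err;
}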
@@ -369,39 +371,43 @@ __kernel void reduction_operation_z(
*
* @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
* @note The batch size must be passed at compile time using -DBATCH e.g. -DBATCH=128
- * @note The depth size must be passed at compile time using -DBATCH e.g. -DDEPTH=128
+ * @note The depth size must be passed at compile time using -DDEPTH e.g. -DDEPTH=128
*
* @param[in] input_ptr Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED/S32/F16/F32
- * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] input_step_w input_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] input_stride_v Stride of the source tensor in V dimension (in bytes)
* @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
* @param[in] output_ptr The local buffer to hold summed values. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the output tensor in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] output_stride_y Stride of the output tensor in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] output_stride_z Stride of the output tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_stride_w Stride of the output tensor in W dimension (in bytes)
- * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] output_stride_v Stride of the output tensor in V dimension (in bytes)
* @param[in] output_offset_first_element_in_bytes The offset of the first element in the source tensor
*/
__kernel void reduction_operation_w(
- TENSOR4D_DECLARATION(input),
- TENSOR4D_DECLARATION(output))
+ __global uchar *input_ptr,
+ uint input_stride_y,
+ uint input_stride_z,
+ uint input_stride_w,
+ uint input_stride_v,
+ uint input_offset_first_element_in_bytes,
+
+ __global uchar *output_ptr,
+ uint output_stride_y,
+ uint output_stride_z,
+ uint output_stride_v,
+ uint output_offset_first_element_in_bytes)
{
int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
int y = get_global_id(1);
- int z = get_global_id(2);
- __global uchar *input_addr = input_ptr + input_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * input_stride_y + (z % DEPTH) * input_stride_z + (z / DEPTH) * input_stride_w;
- __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * output_stride_y + (z % DEPTH) * output_stride_z + (z / DEPTH) * output_stride_z;
+    int gid_2 = get_global_id(2);
+    int z     = gid_2 % DEPTH;
+    int v     = gid_2 / DEPTH;
+
+ __global uchar *input_addr = input_ptr + input_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * input_stride_y + z * input_stride_z + v * input_stride_v;
+ __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * output_stride_y + z * output_stride_z + v * output_stride_v;
VEC_DATA_TYPE(DATA_TYPE_PROMOTED, VEC_SIZE)
res = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input_addr), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, VEC_SIZE));
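Since an OpenCL NDRange has at most three dimensions, reduction_operation_w packs the depth position (z) and the extra batch-like position (v) into global id 2 and unpacks them with DEPTH, as shown in the hunk above. A small stand-alone C check of that mapping follows; DEPTH and the v-extent are placeholder values.

#include <assert.h>
#include <stdio.h>

#define DEPTH 4 /* placeholder; the kernel receives this via -DDEPTH */

int main(void)
{
    const int v_extent = 3; /* placeholder size of the packed v dimension */

    for (int gid_2 = 0; gid_2 < DEPTH * v_extent; ++gid_2)
    {
        int z = gid_2 % DEPTH; /* position along the depth dimension */
        int v = gid_2 / DEPTH; /* position along the collapsed higher dims */
        assert(v * DEPTH + z == gid_2); /* the packing round-trips exactly */
        printf("gid_2=%2d -> z=%d, v=%d\n", gid_2, z, v);
    }
    return 0;
}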