diff options
Diffstat (limited to 'src/core/CL/cl_kernels/reduction_operation.cl')
-rw-r--r-- | src/core/CL/cl_kernels/reduction_operation.cl | 196 |
1 files changed, 191 insertions, 5 deletions
diff --git a/src/core/CL/cl_kernels/reduction_operation.cl b/src/core/CL/cl_kernels/reduction_operation.cl index aa7403b52b..c1be4472a7 100644 --- a/src/core/CL/cl_kernels/reduction_operation.cl +++ b/src/core/CL/cl_kernels/reduction_operation.cl @@ -61,13 +61,14 @@ inline DATA_TYPE sum(__global const DATA_TYPE *input) return (in.s0 + in.s1); } -/** This kernel performs reduction given an operation. +/** This kernel performs parallel reduction given an operation on x-axis. * * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float - * @note The data size must be passed at compile time using -DDATA_SIZE e.g. -DDATA_SIZE=32 * @note The operation we want to perform must be passed at compile time using -DOPERATION e.g. -DOPERATION=square_sum + * @note The mean flag must be passed at compile time using -DMEAN if we want to compute the mean value + * @note The width size must be passed at compile time using -DWIDTH e.g. -DWIDTH=128 if we want to compute the mean value * - * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32 + * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) @@ -81,7 +82,7 @@ inline DATA_TYPE sum(__global const DATA_TYPE *input) * @param[in] partial_sum_offset_first_element_in_bytes The offset of the first element in the source tensor * @param[in] local_sums Local buffer for storing the partial sum */ -__kernel void reduction_operation( +__kernel void reduction_operation_x( IMAGE_DECLARATION(src), IMAGE_DECLARATION(partial_sum), __local DATA_TYPE *local_sums) @@ -109,7 +110,192 @@ __kernel void reduction_operation( if(lid == 0) { +#if defined(MEAN) && defined(WIDTH) + if(y == get_local_size(1) - 1) + { + local_sums[0] /= WIDTH; + } +#endif /* defined(MEAN) && defined(WIDTH) */ ((__global DATA_TYPE *)offset(&partial_sum, get_group_id(0), y))[0] = local_sums[0]; } } -}
\ No newline at end of file +} + +#if defined(WIDTH) +/** This kernel performs reduction on x-axis. (QASYMM8) + * + * @note The width size must be passed at compile time using -DWIDTH e.g. -DWIDTH=128 + * + * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[in] output_ptr The local buffer to hold sumed values. Supported data types: same as @p src_ptt + * @param[in] output_stride_x Stride of the output tensor in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the source tensor + */ +__kernel void reduction_operation_quantized_x( + VECTOR_DECLARATION(src), + VECTOR_DECLARATION(output)) +{ + Vector src = CONVERT_TO_VECTOR_STRUCT(src); + Vector output = CONVERT_TO_VECTOR_STRUCT(output); + + uint res = 0; + + for(unsigned int x = 0; x < WIDTH; ++x) + { + res += *((__global uchar *)vector_offset(&src, x)); + } + +#if defined(MEAN) + res /= WIDTH; +#endif /* defined(MEAN) */ + + // Store result + *((__global uchar *)output.ptr) = convert_uchar(res); +} +#endif /* defined(HEIGHT) */ + +#if defined(HEIGHT) +/** This kernel performs reduction on y-axis. + * + * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float + * @note The height size must be passed at compile time using -DHEIGHT e.g. -DHEIGHT=128 + * + * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8/F16/F32 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[in] output_ptr The local buffer to hold sumed values. Supported data types: same as @p src_ptt + * @param[in] output_stride_x Stride of the output tensor in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the output tensor in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the source tensor + */ +__kernel void reduction_operation_y( + IMAGE_DECLARATION(src), + IMAGE_DECLARATION(output)) +{ + Image src = CONVERT_TO_IMAGE_STRUCT(src); + Image output = CONVERT_TO_IMAGE_STRUCT(output); + + VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16) + res = 0; + + for(unsigned int y = 0; y < HEIGHT; ++y) + { + res += CONVERT(vload16(0, (__global DATA_TYPE *)offset(&src, 0, y)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16)); + } + +#if defined(MEAN) + res /= HEIGHT; +#endif /* defined(MEAN) */ + + // Store result + vstore16(CONVERT(res, VEC_DATA_TYPE(DATA_TYPE, 16)), 0, (__global DATA_TYPE *)output.ptr); +} +#endif /* defined(HEIGHT) */ + +#if defined(DEPTH) +/** This kernel performs reduction on z-axis. + * + * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float + * @note The depth size must be passed at compile time using -DDEPTH e.g. -DDEPTH=128 + * + * @param[in] input_ptr Pointer to the source tensor. Supported data types: QASYMM8/F16/F32 + * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[in] output_ptr The local buffer to hold sumed values. Supported data types: same as @p input_ptt + * @param[in] output_stride_x Stride of the output tensor in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the output tensor in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the output tensor in Z dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the source tensor + */ +__kernel void reduction_operation_z( + TENSOR3D_DECLARATION(input), + TENSOR3D_DECLARATION(output)) +{ + Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); + Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); + + VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16) + res = 0; + + for(unsigned int z = 0; z < DEPTH; ++z) + { + res += CONVERT(vload16(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, z)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16)); + } + +#if defined(MEAN) + res /= DEPTH; +#endif /* defined(MEAN) */ + + // Store result + vstore16(CONVERT(res, VEC_DATA_TYPE(DATA_TYPE, 16)), 0, (__global DATA_TYPE *)output.ptr); +} +#endif /* defined(DEPTH) */ + +#if defined(BATCH) && defined(DEPTH) +/** This kernel performs reduction on w-axis. + * + * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float + * @note The batch size must be passed at compile time using -DBATCH e.g. -DBATCH=128 + * @note The depth size must be passed at compile time using -DBATCH e.g. -DDEPTH=128 + * + * @param[in] input_ptr Pointer to the source tensor. Supported data types: QASYMM8/F16/F32 + * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] input_step_w input_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[in] output_ptr The local buffer to hold sumed values. Supported data types: same as @p input_ptt + * @param[in] output_stride_x Stride of the output tensor in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the output tensor in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the output tensor in Z dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_stride_w Stride of the output tensor in W dimension (in bytes) + * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the source tensor + */ +__kernel void reduction_operation_w( + TENSOR4D_DECLARATION(input), + TENSOR4D_DECLARATION(output)) +{ + Tensor4D input = CONVERT_TO_TENSOR4D_STRUCT(input, DEPTH); + Tensor4D output = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH); + + VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16) + res = 0; + + for(unsigned int w = 0; w < BATCH; ++w) + { + res += CONVERT(vload16(0, (__global DATA_TYPE *)tensor4D_offset(&input, 0, 0, 0, w)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16)); + } + +#if defined(MEAN) + res /= BATCH; +#endif /* defined(MEAN) */ + + // Store result + vstore16(CONVERT(res, VEC_DATA_TYPE(DATA_TYPE, 16)), 0, (__global DATA_TYPE *)output.ptr); +} +#endif /* defined(BATCH) && defined(DEPTH) */
\ No newline at end of file |