diff options
author | shubham <shub98.gupta@samsung.com> | 2019-01-07 21:37:55 +0530 |
---|---|---|
committer | Georgios Pinitas <georgios.pinitas@arm.com> | 2019-01-08 16:27:21 +0000 |
commit | e1a4e37ac266324de366401f2bda459b011a4869 (patch) | |
tree | 60881f6ce7af53f0286a1313f1998d10643051eb /src/core/CL/cl_kernels/permute.cl | |
parent | d33fe343103edbea4fb1cf6121c49caef36ff379 (diff) | |
download | ComputeLibrary-e1a4e37ac266324de366401f2bda459b011a4869.tar.gz |
Implementation of Permute CL kernel to handle all permutations
This patch will add a generic permute cl-kernel to handle
all permutations available for tensors having rank upto 4.
Change-Id: I50eb555d9d45d5ad5f7fa9b0a3862dd17551d458
Signed-off-by: shubham <shub98.gupta@samsung.com>
Reviewed-on: https://review.mlplatform.org/449
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Manuel Bottini <manuel.bottini@arm.com>
Reviewed-by: Michalis Spyrou <michalis.spyrou@arm.com>
Diffstat (limited to 'src/core/CL/cl_kernels/permute.cl')
-rw-r--r-- | src/core/CL/cl_kernels/permute.cl | 92 |
1 files changed, 19 insertions, 73 deletions
diff --git a/src/core/CL/cl_kernels/permute.cl b/src/core/CL/cl_kernels/permute.cl index 03fc15e4e8..77f03f7d5b 100644 --- a/src/core/CL/cl_kernels/permute.cl +++ b/src/core/CL/cl_kernels/permute.cl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018 ARM Limited. + * Copyright (c) 2018-2019 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -23,11 +23,12 @@ */ #include "helpers.h" -#if defined(DATA_TYPE) && defined(DEPTH_IN) -/** Perform a DCHW -> DHWC permute operation on an input tensor. +#if defined(DATA_TYPE) && defined(DEPTH_IN) && defined(P1) && defined(P2) && defined(P3) && defined(P4) +/**Perform a permute operation on an input tensor of Shape DCHW. * * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float * @attention Input tensor depth should be given as a preprocessor argument using -DDEPTH_IN=size. e.g. -DDEPTH_IN=16 + * @attention Permutation vector is passed as a preprocessor arguement using -DP1, -DP2, -DP3 and -DP4=int, e.g. -DP1=2, -DP2=1, -DP3=0 and -DP4=3. * * @param[in] input_ptr Pointer to the source image. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) @@ -48,81 +49,26 @@ * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes) * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image */ -__kernel void permute_201( - TENSOR4D_DECLARATION(input), - TENSOR4D_DECLARATION(output)) -{ - Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, DEPTH_IN); - Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0); +__kernel void permute(TENSOR4D_DECLARATION(input), + TENSOR4D_DECLARATION(output)) - *((__global DATA_TYPE *)tensor4D_offset(&out, (get_global_id(2) % DEPTH_IN), get_global_id(0), get_global_id(1), (get_global_id(2) / DEPTH_IN))) = *((__global DATA_TYPE *)in.ptr); -} - -/** Perform a DCHW -> DWCH permute operation on an input tensor. - * - * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float - * @attention Input tensor depth should be given as a preprocessor argument using -DDEPTH_IN=size. e.g. -DDEPTH_IN=16 - * - * @param[in] input_ptr Pointer to the source image. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 - * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void permute_120( - TENSOR4D_DECLARATION(input), - TENSOR4D_DECLARATION(output)) { Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, DEPTH_IN); Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0); - *((__global DATA_TYPE *)tensor4D_offset(&out, get_global_id(1), (get_global_id(2) % DEPTH_IN), get_global_id(0), (get_global_id(2) / DEPTH_IN))) = *((__global DATA_TYPE *)in.ptr); -} + int out_index[4] = { 0 }; + int in_index[4] = { 0 }; -/** Perform a DCHW -> HWCD permute operation on an input tensor. - * - * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float - * @attention Input tensor depth should be given as a preprocessor argument using -DDEPTH_IN=size. e.g. -DDEPTH_IN=16 - * - * @param[in] input_ptr Pointer to the source image. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 - * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void permute_3201( - TENSOR4D_DECLARATION(input), - TENSOR4D_DECLARATION(output)) -{ - Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, DEPTH_IN); - Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0); + in_index[0] = get_global_id(0); // W + in_index[1] = get_global_id(1); // H + in_index[2] = get_global_id(2) % DEPTH_IN; // C + in_index[3] = get_global_id(2) / DEPTH_IN; // B + + out_index[0] = in_index[P1]; + out_index[1] = in_index[P2]; + out_index[2] = in_index[P3]; + out_index[3] = in_index[P4]; - *((__global DATA_TYPE *)tensor4D_offset(&out, (get_global_id(2) / DEPTH_IN), (get_global_id(2) % DEPTH_IN), get_global_id(0), get_global_id(1))) = *((__global DATA_TYPE *)in.ptr); + *((__global DATA_TYPE *)tensor4D_offset(&out, out_index[0], out_index[1], out_index[2], out_index[3])) = *((__global DATA_TYPE *)in.ptr); } -#endif // defined(DATA_TYPE) && defined(DEPTH_IN) +#endif // defined(DATA_TYPE) && defined(DEPTH_IN) && defined(P1) && defined(P2) && defined(P3) && defined(P4) |