aboutsummaryrefslogtreecommitdiff
path: root/src/core
diff options
context:
space:
mode:
authorshubham <shub98.gupta@samsung.com>2019-01-07 21:37:55 +0530
committerGeorgios Pinitas <georgios.pinitas@arm.com>2019-01-08 16:27:21 +0000
commite1a4e37ac266324de366401f2bda459b011a4869 (patch)
tree60881f6ce7af53f0286a1313f1998d10643051eb /src/core
parentd33fe343103edbea4fb1cf6121c49caef36ff379 (diff)
downloadComputeLibrary-e1a4e37ac266324de366401f2bda459b011a4869.tar.gz
Implementation of Permute CL kernel to handle all permutations
This patch will add a generic permute cl-kernel to handle all permutations available for tensors having rank upto 4. Change-Id: I50eb555d9d45d5ad5f7fa9b0a3862dd17551d458 Signed-off-by: shubham <shub98.gupta@samsung.com> Reviewed-on: https://review.mlplatform.org/449 Tested-by: Arm Jenkins <bsgcomp@arm.com> Reviewed-by: Manuel Bottini <manuel.bottini@arm.com> Reviewed-by: Michalis Spyrou <michalis.spyrou@arm.com>
Diffstat (limited to 'src/core')
-rw-r--r--src/core/CL/CLKernelLibrary.cpp4
-rw-r--r--src/core/CL/cl_kernels/permute.cl92
-rw-r--r--src/core/CL/kernels/CLPermuteKernel.cpp50
3 files changed, 40 insertions, 106 deletions
diff --git a/src/core/CL/CLKernelLibrary.cpp b/src/core/CL/CLKernelLibrary.cpp
index 2bc2d06827..905a34a509 100644
--- a/src/core/CL/CLKernelLibrary.cpp
+++ b/src/core/CL/CLKernelLibrary.cpp
@@ -368,9 +368,7 @@ const std::map<std::string, std::string> CLKernelLibrary::_kernel_program_map =
{ "NV21_to_RGBA8888_bt709", "color_convert.cl" },
{ "NV21_to_YUV444_bt709", "color_convert.cl" },
{ "output_stage_quantized", "direct_convolution_1x1_3x3_5x5_quantized.cl" },
- { "permute_201", "permute.cl" },
- { "permute_120", "permute.cl" },
- { "permute_3201", "permute.cl" },
+ { "permute", "permute.cl" },
{ "pixelwise_mul_float", "pixelwise_mul_float.cl" },
{ "pixelwise_mul_int", "pixelwise_mul_int.cl" },
{ "pixelwise_mul_quantized", "pixelwise_mul_int.cl" },
diff --git a/src/core/CL/cl_kernels/permute.cl b/src/core/CL/cl_kernels/permute.cl
index 03fc15e4e8..77f03f7d5b 100644
--- a/src/core/CL/cl_kernels/permute.cl
+++ b/src/core/CL/cl_kernels/permute.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,11 +23,12 @@
*/
#include "helpers.h"
-#if defined(DATA_TYPE) && defined(DEPTH_IN)
-/** Perform a DCHW -> DHWC permute operation on an input tensor.
+#if defined(DATA_TYPE) && defined(DEPTH_IN) && defined(P1) && defined(P2) && defined(P3) && defined(P4)
+/**Perform a permute operation on an input tensor of Shape DCHW.
*
* @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
* @attention Input tensor depth should be given as a preprocessor argument using -DDEPTH_IN=size. e.g. -DDEPTH_IN=16
+ * @attention Permutation vector is passed as a preprocessor arguement using -DP1, -DP2, -DP3 and -DP4=int, e.g. -DP1=2, -DP2=1, -DP3=0 and -DP4=3.
*
* @param[in] input_ptr Pointer to the source image. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
* @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
@@ -48,81 +49,26 @@
* @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
* @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
*/
-__kernel void permute_201(
- TENSOR4D_DECLARATION(input),
- TENSOR4D_DECLARATION(output))
-{
- Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, DEPTH_IN);
- Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0);
+__kernel void permute(TENSOR4D_DECLARATION(input),
+ TENSOR4D_DECLARATION(output))
- *((__global DATA_TYPE *)tensor4D_offset(&out, (get_global_id(2) % DEPTH_IN), get_global_id(0), get_global_id(1), (get_global_id(2) / DEPTH_IN))) = *((__global DATA_TYPE *)in.ptr);
-}
-
-/** Perform a DCHW -> DWCH permute operation on an input tensor.
- *
- * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
- * @attention Input tensor depth should be given as a preprocessor argument using -DDEPTH_IN=size. e.g. -DDEPTH_IN=16
- *
- * @param[in] input_ptr Pointer to the source image. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
- * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void permute_120(
- TENSOR4D_DECLARATION(input),
- TENSOR4D_DECLARATION(output))
{
Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, DEPTH_IN);
Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0);
- *((__global DATA_TYPE *)tensor4D_offset(&out, get_global_id(1), (get_global_id(2) % DEPTH_IN), get_global_id(0), (get_global_id(2) / DEPTH_IN))) = *((__global DATA_TYPE *)in.ptr);
-}
+ int out_index[4] = { 0 };
+ int in_index[4] = { 0 };
-/** Perform a DCHW -> HWCD permute operation on an input tensor.
- *
- * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
- * @attention Input tensor depth should be given as a preprocessor argument using -DDEPTH_IN=size. e.g. -DDEPTH_IN=16
- *
- * @param[in] input_ptr Pointer to the source image. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
- * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void permute_3201(
- TENSOR4D_DECLARATION(input),
- TENSOR4D_DECLARATION(output))
-{
- Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, DEPTH_IN);
- Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0);
+ in_index[0] = get_global_id(0); // W
+ in_index[1] = get_global_id(1); // H
+ in_index[2] = get_global_id(2) % DEPTH_IN; // C
+ in_index[3] = get_global_id(2) / DEPTH_IN; // B
+
+ out_index[0] = in_index[P1];
+ out_index[1] = in_index[P2];
+ out_index[2] = in_index[P3];
+ out_index[3] = in_index[P4];
- *((__global DATA_TYPE *)tensor4D_offset(&out, (get_global_id(2) / DEPTH_IN), (get_global_id(2) % DEPTH_IN), get_global_id(0), get_global_id(1))) = *((__global DATA_TYPE *)in.ptr);
+ *((__global DATA_TYPE *)tensor4D_offset(&out, out_index[0], out_index[1], out_index[2], out_index[3])) = *((__global DATA_TYPE *)in.ptr);
}
-#endif // defined(DATA_TYPE) && defined(DEPTH_IN)
+#endif // defined(DATA_TYPE) && defined(DEPTH_IN) && defined(P1) && defined(P2) && defined(P3) && defined(P4)
diff --git a/src/core/CL/kernels/CLPermuteKernel.cpp b/src/core/CL/kernels/CLPermuteKernel.cpp
index a9a2c5c97a..6c44199f7a 100644
--- a/src/core/CL/kernels/CLPermuteKernel.cpp
+++ b/src/core/CL/kernels/CLPermuteKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -56,16 +56,20 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
DataType::U16, DataType::S16,
DataType::U32, DataType::S32,
DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((perm != PermutationVector{ 2U, 0U, 1U })
- && (perm != PermutationVector{ 1U, 2U, 0U })
- && (perm != PermutationVector{ 3U, 2U, 0U, 1U }),
- "Only [2, 0, 1], [1, 2, 0] and [3, 2, 0, 1] permutation is supported");
- const TensorShape output_shape = misc::shape_calculator::compute_permutation_output_shape(*input, perm);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() < 1 || input->num_dimensions() > 4,
+ "Permutation upto 4-D input tensor is supported");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(perm.num_dimensions() < 1 || perm.num_dimensions() > 4,
+ "Permutation vector size should be less than or equal to 4");
+ for(const auto &p : perm)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(p >= perm.num_dimensions(), "Permutation vector has invalid values");
+ }
// Validate configured output
if(output->total_size() != 0)
{
+ const TensorShape output_shape = misc::shape_calculator::compute_permutation_output_shape(*input, perm);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
}
@@ -87,30 +91,16 @@ void CLPermuteKernel::configure(const ICLTensor *input, ICLTensor *output, const
auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
// Create kernel
- std::set<std::string> build_opts;
-
- build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
- build_opts.emplace("-DDEPTH_IN=" + support::cpp11::to_string(input->info()->dimension(2)));
-
- // Run [2, 0, 1] permute
- if(_perm == PermutationVector{ 2U, 0U, 1U })
- {
- _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("permute_201", build_opts));
- }
- // Run [1, 2, 0] permute
- else if(_perm == PermutationVector{ 1U, 2U, 0U })
- {
- _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("permute_120", build_opts));
- }
- // Run [3, 2, 0, 1] permute
- else if(_perm == PermutationVector{ 3U, 2U, 0U, 1U })
- {
- _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("permute_3201", build_opts));
- }
- else
- {
- ARM_COMPUTE_ERROR("Not supported.");
- }
+ CLBuildOptions build_opts;
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+ build_opts.add_option("-DDEPTH_IN=" + support::cpp11::to_string(input->info()->dimension(2)));
+ // New positions of width(W), height(H), channel(C) and batch(D) based on permutation vector
+ build_opts.add_option("-DP1=" + support::cpp11::to_string((_perm.num_dimensions() >= 1) ? perm[0] : 0));
+ build_opts.add_option("-DP2=" + support::cpp11::to_string((_perm.num_dimensions() >= 2) ? perm[1] : 1));
+ build_opts.add_option("-DP3=" + support::cpp11::to_string((_perm.num_dimensions() >= 3) ? perm[2] : 2));
+ build_opts.add_option("-DP4=" + support::cpp11::to_string((_perm.num_dimensions() >= 4) ? perm[3] : 3));
+
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("permute", build_opts.options()));
// Configure kernel window
Window win = calculate_max_window(*input->info(), Steps());