From a23b4686a091a7960a4b336d0fe53f15db4ae538 Mon Sep 17 00:00:00 2001 From: Jakub Sujak Date: Thu, 5 Oct 2023 10:20:59 +0100 Subject: Optimize CLTranspose operator * Transpose higher dimensional tensors (>2D) by collapsing higher dimensions into the third dimension thus avoiding multiple dispatches of the CL kernel * Maximize tile size without register spilling Resolves: COMPMID-6448 Change-Id: Iac094b8c428bdf319d9c28a8334cb55d58e2d14b Signed-off-by: Jakub Sujak Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/10443 Tested-by: Arm Jenkins Reviewed-by: Viet-Hoa Do Comments-Addressed: Arm Jenkins Benchmark: Arm Jenkins --- src/gpu/cl/kernels/ClTransposeKernel.cpp | 46 ++++++++++++++++++++++++-------- src/gpu/cl/kernels/ClTransposeKernel.h | 10 +++---- 2 files changed, 40 insertions(+), 16 deletions(-) (limited to 'src/gpu') diff --git a/src/gpu/cl/kernels/ClTransposeKernel.cpp b/src/gpu/cl/kernels/ClTransposeKernel.cpp index 6eb2bf81c0..f95a215107 100644 --- a/src/gpu/cl/kernels/ClTransposeKernel.cpp +++ b/src/gpu/cl/kernels/ClTransposeKernel.cpp @@ -58,14 +58,37 @@ void ClTransposeKernel::configure(const CLCompileContext &compile_context, const const TensorShape dst_shape = misc::shape_calculator::compute_transposed_shape(*src); auto_init_if_empty(*dst, src->clone()->set_tensor_shape(dst_shape)); + // Explicitly set the tensor shape to preserve dimensions + dst->set_tensor_shape(dst_shape); + ARM_COMPUTE_ERROR_THROW_ON(ClTransposeKernel::validate(src, dst)); auto padding_info = get_padding_info({src, dst}); - // Create kernel - const unsigned int vec_size_x = adjust_vec_size(max_cl_vector_width / src->element_size(), src->dimension(0)); - const int vec_size_x_leftovers = src->dimension(0) % vec_size_x; - const unsigned int vec_size_y = adjust_vec_size(max_cl_vector_width / src->element_size(), src->dimension(1)); - const int vec_size_y_leftovers = src->dimension(1) % vec_size_y; + unsigned int vec_size_x; + unsigned int vec_size_y; + + // Set the optimal tile size for each data type without register spilling + switch (src->element_size()) + { + case 1: + vec_size_x = adjust_vec_size(8, src->dimension(0)); + vec_size_y = adjust_vec_size(16, src->dimension(1)); + break; + case 2: + vec_size_x = adjust_vec_size(8, src->dimension(0)); + vec_size_y = adjust_vec_size(8, src->dimension(1)); + break; + case 4: + vec_size_x = adjust_vec_size(4, src->dimension(0)); + vec_size_y = adjust_vec_size(8, src->dimension(1)); + break; + default: + ARM_COMPUTE_ERROR("Unsupported data type"); + break; + } + + const int vec_size_x_leftovers = src->dimension(0) % vec_size_x; + const int vec_size_y_leftovers = src->dimension(1) % vec_size_y; CLBuildOptions build_opts; build_opts.add_option("-DDATA_TYPE_IN_BYTES=" + support::cpp11::to_string(src->element_size())); @@ -78,7 +101,7 @@ void ClTransposeKernel::configure(const CLCompileContext &compile_context, const // Configure kernel window Window win = calculate_max_window(*src, Steps(vec_size_x, vec_size_y)); - ICLKernel::configure_internal(win, cl::NDRange(2, 8)); + ICLKernel::configure_internal(win); ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } @@ -87,7 +110,6 @@ Status ClTransposeKernel::validate(const ITensorInfo *src, const ITensorInfo *ds ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->num_dimensions() > 2, "Transpose up to 2-D src tensor is supported"); // Validate configured dst if (dst->total_size() != 0) @@ -112,15 +134,17 @@ void ClTransposeKernel::run_op(ITensorPack &tensors, const Window &window, cl::C utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); - Window slice = window.first_slice_window_2D(); + // Collapse dimensions higher than width and height into the batch dimension + Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); + Window slice = collapsed.first_slice_window_3D(); do { unsigned int idx = 0; - add_2D_tensor_argument(idx, src, slice); - add_2D_tensor_argument(idx, dst, slice); + add_3D_tensor_argument(idx, src, slice); + add_3D_tensor_argument(idx, dst, slice); enqueue(queue, *this, slice, lws_hint()); - } while (window.slide_window_slice_2D(slice)); + } while (collapsed.slide_window_slice_3D(slice)); } } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClTransposeKernel.h b/src/gpu/cl/kernels/ClTransposeKernel.h index b30d6f0281..eaad38b20f 100644 --- a/src/gpu/cl/kernels/ClTransposeKernel.h +++ b/src/gpu/cl/kernels/ClTransposeKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2021 Arm Limited. + * Copyright (c) 2017-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_CL_TRANSPOSE_KERNEL_H -#define ARM_COMPUTE_CL_TRANSPOSE_KERNEL_H +#ifndef ACL_SRC_GPU_CL_KERNELS_CLTRANSPOSEKERNEL_H +#define ACL_SRC_GPU_CL_KERNELS_CLTRANSPOSEKERNEL_H #include "src/core/common/Macros.h" #include "src/gpu/cl/ClCompileContext.h" @@ -34,7 +34,7 @@ namespace opencl { namespace kernels { -/** OpenCL kernel to transpose a 2D tensor. */ +/** OpenCL kernel to transpose a tensor. Only the first two dimensions (width, height) are transposed. */ class ClTransposeKernel : public IClKernel { public: @@ -61,4 +61,4 @@ public: } // namespace kernels } // namespace opencl } // namespace arm_compute -#endif /* ARM_COMPUTE_CL_TRANSPOSE_KERNEL_H */ +#endif // ACL_SRC_GPU_CL_KERNELS_CLTRANSPOSEKERNEL_H -- cgit v1.2.1