From a23b4686a091a7960a4b336d0fe53f15db4ae538 Mon Sep 17 00:00:00 2001 From: Jakub Sujak Date: Thu, 5 Oct 2023 10:20:59 +0100 Subject: Optimize CLTranspose operator * Transpose higher dimensional tensors (>2D) by collapsing higher dimensions into the third dimension thus avoiding multiple dispatches of the CL kernel * Maximize tile size without register spilling Resolves: COMPMID-6448 Change-Id: Iac094b8c428bdf319d9c28a8334cb55d58e2d14b Signed-off-by: Jakub Sujak Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/10443 Tested-by: Arm Jenkins Reviewed-by: Viet-Hoa Do Comments-Addressed: Arm Jenkins Benchmark: Arm Jenkins --- docs/user_guide/release_version_and_change_log.dox | 1 + src/core/CL/cl_kernels/common/transpose.cl | 17 +++++--- src/cpu/kernels/CpuTransposeKernel.cpp | 5 ++- src/gpu/cl/kernels/ClTransposeKernel.cpp | 46 ++++++++++++++++------ src/gpu/cl/kernels/ClTransposeKernel.h | 10 ++--- tests/validation/CL/Transpose.cpp | 25 +++++++++++- tests/validation/fixtures/TransposeFixture.h | 10 ++--- 7 files changed, 84 insertions(+), 30 deletions(-) diff --git a/docs/user_guide/release_version_and_change_log.dox b/docs/user_guide/release_version_and_change_log.dox index 3e04837c1e..882244d2f2 100644 --- a/docs/user_guide/release_version_and_change_log.dox +++ b/docs/user_guide/release_version_and_change_log.dox @@ -54,6 +54,7 @@ v23.11 Public major release - Remove legacy PostOps interface. PostOps was the experimental interface for kernel fusion and is replaced by the new Dynamic Fusion interface. - Performance optimizations: - Optimize @ref cpu::CpuReshape + - Optimize @ref opencl::ClTranspose - Add new OpenCLâ„¢ kernels: - @ref opencl::kernels::ClMatMulLowpNativeMMULKernel support for QASYMM8 and QASYMM8_SIGNED, with batch support - Deprecate support for Bfloat16 in @ref cpu::CpuCast. diff --git a/src/core/CL/cl_kernels/common/transpose.cl b/src/core/CL/cl_kernels/common/transpose.cl index 82db2908b5..5b4c68ca10 100644 --- a/src/core/CL/cl_kernels/common/transpose.cl +++ b/src/core/CL/cl_kernels/common/transpose.cl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2021 Arm Limited. + * Copyright (c) 2017-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -124,23 +124,28 @@ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) * @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes) * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source matrix in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix * @param[out] dst_ptr Pointer to the destination matrix Supported data type: same as src_ptr * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes) * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes) * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes) * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination matrix in Z dimension (in bytes) + * @param[in] dst_step_z dst_gx_stride_z * number of elements along Z processed per workitem(in bytes) * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix */ -__kernel void transpose(IMAGE_DECLARATION(src), - IMAGE_DECLARATION(dst)) +__kernel void transpose(TENSOR3D_DECLARATION(src), + TENSOR3D_DECLARATION(dst)) { uint x_offs = max((int)(get_global_id(0) * VEC_SIZE_X - (VEC_SIZE_X - VEC_SIZE_LEFTOVER_X) % VEC_SIZE_X), 0); uint y_offs = max((int)(get_global_id(1) * VEC_SIZE_Y - (VEC_SIZE_Y - VEC_SIZE_LEFTOVER_Y) % VEC_SIZE_Y), 0); + uint z_offs = get_global_id(2); // Compute addresses - __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x_offs * DATA_TYPE_IN_BYTES + y_offs * src_stride_y; - __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + y_offs * DATA_TYPE_IN_BYTES + x_offs * dst_stride_y; + __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x_offs * DATA_TYPE_IN_BYTES + y_offs * src_stride_y + z_offs * src_stride_z; + __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + y_offs * DATA_TYPE_IN_BYTES + x_offs * dst_stride_y + z_offs * dst_stride_z; // Load the NxM block at (x, y) VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X) @@ -237,4 +242,4 @@ __kernel void transpose(IMAGE_DECLARATION(src), VEC_SIZE_LEFTOVER_Y != 0 && get_global_id(1) == 0); } -#endif // defined(DATA_TYPE_IN_BYTES) && defined(VEC_SIZE_X) && defined(VEC_SIZE_LEFTOVER_X) && defined(VEC_SIZE_Y) && defined(VEC_SIZE_LEFTOVER_Y) \ No newline at end of file +#endif // defined(DATA_TYPE_IN_BYTES) && defined(VEC_SIZE_X) && defined(VEC_SIZE_LEFTOVER_X) && defined(VEC_SIZE_Y) && defined(VEC_SIZE_LEFTOVER_Y) diff --git a/src/cpu/kernels/CpuTransposeKernel.cpp b/src/cpu/kernels/CpuTransposeKernel.cpp index 615bc6ce1e..0f762ba041 100644 --- a/src/cpu/kernels/CpuTransposeKernel.cpp +++ b/src/cpu/kernels/CpuTransposeKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -737,6 +737,9 @@ void CpuTransposeKernel::configure(const ITensorInfo *src, ITensorInfo *dst) const TensorShape dst_shape = misc::shape_calculator::compute_transposed_shape(*src); auto_init_if_empty(*dst, src->clone()->set_tensor_shape(dst_shape)); + // Explicitly set the tensor shape to preserve dimensions + dst->set_tensor_shape(dst_shape); + // Perform validation step ARM_COMPUTE_ERROR_THROW_ON(validate(src, dst)); diff --git a/src/gpu/cl/kernels/ClTransposeKernel.cpp b/src/gpu/cl/kernels/ClTransposeKernel.cpp index 6eb2bf81c0..f95a215107 100644 --- a/src/gpu/cl/kernels/ClTransposeKernel.cpp +++ b/src/gpu/cl/kernels/ClTransposeKernel.cpp @@ -58,14 +58,37 @@ void ClTransposeKernel::configure(const CLCompileContext &compile_context, const const TensorShape dst_shape = misc::shape_calculator::compute_transposed_shape(*src); auto_init_if_empty(*dst, src->clone()->set_tensor_shape(dst_shape)); + // Explicitly set the tensor shape to preserve dimensions + dst->set_tensor_shape(dst_shape); + ARM_COMPUTE_ERROR_THROW_ON(ClTransposeKernel::validate(src, dst)); auto padding_info = get_padding_info({src, dst}); - // Create kernel - const unsigned int vec_size_x = adjust_vec_size(max_cl_vector_width / src->element_size(), src->dimension(0)); - const int vec_size_x_leftovers = src->dimension(0) % vec_size_x; - const unsigned int vec_size_y = adjust_vec_size(max_cl_vector_width / src->element_size(), src->dimension(1)); - const int vec_size_y_leftovers = src->dimension(1) % vec_size_y; + unsigned int vec_size_x; + unsigned int vec_size_y; + + // Set the optimal tile size for each data type without register spilling + switch (src->element_size()) + { + case 1: + vec_size_x = adjust_vec_size(8, src->dimension(0)); + vec_size_y = adjust_vec_size(16, src->dimension(1)); + break; + case 2: + vec_size_x = adjust_vec_size(8, src->dimension(0)); + vec_size_y = adjust_vec_size(8, src->dimension(1)); + break; + case 4: + vec_size_x = adjust_vec_size(4, src->dimension(0)); + vec_size_y = adjust_vec_size(8, src->dimension(1)); + break; + default: + ARM_COMPUTE_ERROR("Unsupported data type"); + break; + } + + const int vec_size_x_leftovers = src->dimension(0) % vec_size_x; + const int vec_size_y_leftovers = src->dimension(1) % vec_size_y; CLBuildOptions build_opts; build_opts.add_option("-DDATA_TYPE_IN_BYTES=" + support::cpp11::to_string(src->element_size())); @@ -78,7 +101,7 @@ void ClTransposeKernel::configure(const CLCompileContext &compile_context, const // Configure kernel window Window win = calculate_max_window(*src, Steps(vec_size_x, vec_size_y)); - ICLKernel::configure_internal(win, cl::NDRange(2, 8)); + ICLKernel::configure_internal(win); ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } @@ -87,7 +110,6 @@ Status ClTransposeKernel::validate(const ITensorInfo *src, const ITensorInfo *ds ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->num_dimensions() > 2, "Transpose up to 2-D src tensor is supported"); // Validate configured dst if (dst->total_size() != 0) @@ -112,15 +134,17 @@ void ClTransposeKernel::run_op(ITensorPack &tensors, const Window &window, cl::C utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); - Window slice = window.first_slice_window_2D(); + // Collapse dimensions higher than width and height into the batch dimension + Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); + Window slice = collapsed.first_slice_window_3D(); do { unsigned int idx = 0; - add_2D_tensor_argument(idx, src, slice); - add_2D_tensor_argument(idx, dst, slice); + add_3D_tensor_argument(idx, src, slice); + add_3D_tensor_argument(idx, dst, slice); enqueue(queue, *this, slice, lws_hint()); - } while (window.slide_window_slice_2D(slice)); + } while (collapsed.slide_window_slice_3D(slice)); } } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClTransposeKernel.h b/src/gpu/cl/kernels/ClTransposeKernel.h index b30d6f0281..eaad38b20f 100644 --- a/src/gpu/cl/kernels/ClTransposeKernel.h +++ b/src/gpu/cl/kernels/ClTransposeKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2021 Arm Limited. + * Copyright (c) 2017-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_CL_TRANSPOSE_KERNEL_H -#define ARM_COMPUTE_CL_TRANSPOSE_KERNEL_H +#ifndef ACL_SRC_GPU_CL_KERNELS_CLTRANSPOSEKERNEL_H +#define ACL_SRC_GPU_CL_KERNELS_CLTRANSPOSEKERNEL_H #include "src/core/common/Macros.h" #include "src/gpu/cl/ClCompileContext.h" @@ -34,7 +34,7 @@ namespace opencl { namespace kernels { -/** OpenCL kernel to transpose a 2D tensor. */ +/** OpenCL kernel to transpose a tensor. Only the first two dimensions (width, height) are transposed. */ class ClTransposeKernel : public IClKernel { public: @@ -61,4 +61,4 @@ public: } // namespace kernels } // namespace opencl } // namespace arm_compute -#endif /* ARM_COMPUTE_CL_TRANSPOSE_KERNEL_H */ +#endif // ACL_SRC_GPU_CL_KERNELS_CLTRANSPOSEKERNEL_H diff --git a/tests/validation/CL/Transpose.cpp b/tests/validation/CL/Transpose.cpp index 943534058b..6cf5fe8537 100644 --- a/tests/validation/CL/Transpose.cpp +++ b/tests/validation/CL/Transpose.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2021 Arm Limited. + * Copyright (c) 2017-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -50,12 +50,14 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip( framework::dataset::make("InputInfo", { TensorInfo(TensorShape(21U, 13U), 1, DataType::U16), // Invalid shape TensorInfo(TensorShape(20U, 13U), 1, DataType::U8), // Wrong data type TensorInfo(TensorShape(20U, 16U), 1, DataType::U32), // Valid + TensorInfo(TensorShape(20U, 16U, 3U, 3U), 1, DataType::U16), // Transpose only first two dimensions }), framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(13U, 20U), 1, DataType::U32), TensorInfo(TensorShape(31U, 20U), 1, DataType::U16), TensorInfo(TensorShape(16U, 20U), 1, DataType::U32), + TensorInfo(TensorShape(16U, 20U, 3U, 3U), 1, DataType::U16), })), - framework::dataset::make("Expected", { false, false, true })), + framework::dataset::make("Expected", { false, false, true, true })), a_info, output_info, expected) { // Lock tensors @@ -80,6 +82,16 @@ FIXTURE_DATA_TEST_CASE(RunLarge, CLTransposeFixture, framework::Dataset // Validate output validate(CLAccessor(_target), _reference); } +FIXTURE_DATA_TEST_CASE(RunLargeHighDimensional, + CLTransposeFixture, + framework::DatasetMode::NIGHTLY, + combine(concat(concat(datasets::Large3DShapes(), datasets::Large4DShapes()), + datasets::Large5dShapes()), + framework::dataset::make("DataType", DataType::U8))) +{ + // Validate output + validate(CLAccessor(_target), _reference); +} TEST_SUITE_END() // U8 TEST_SUITE(U16) @@ -106,6 +118,15 @@ FIXTURE_DATA_TEST_CASE(RunSmall, CLTransposeFixture, framework::Datase // Validate output validate(CLAccessor(_target), _reference); } +FIXTURE_DATA_TEST_CASE(RunSmallHighDimensional, + CLTransposeFixture, + framework::DatasetMode::PRECOMMIT, + combine(concat(datasets::Small3DShapes(), datasets::Small4DShapes()), + framework::dataset::make("DataType", DataType::U32))) +{ + // Validate output + validate(CLAccessor(_target), _reference); +} FIXTURE_DATA_TEST_CASE(RunLarge, CLTransposeFixture, framework::DatasetMode::NIGHTLY, combine(concat(datasets::Large1DShapes(), datasets::Large2DShapes()), framework::dataset::make("DataType", DataType::U32))) { diff --git a/tests/validation/fixtures/TransposeFixture.h b/tests/validation/fixtures/TransposeFixture.h index 92eb9af0c1..212c76cc9a 100644 --- a/tests/validation/fixtures/TransposeFixture.h +++ b/tests/validation/fixtures/TransposeFixture.h @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_TEST_TRANSPOSE_FIXTURE -#define ARM_COMPUTE_TEST_TRANSPOSE_FIXTURE +#ifndef ACL_TESTS_VALIDATION_FIXTURES_TRANSPOSEFIXTURE_H +#define ACL_TESTS_VALIDATION_FIXTURES_TRANSPOSEFIXTURE_H #include "arm_compute/core/TensorShape.h" #include "arm_compute/core/Types.h" @@ -32,7 +32,7 @@ #include "tests/IAccessor.h" #include "tests/framework/Asserts.h" #include "tests/framework/Fixture.h" -#include "tests/validation/reference/Transpose.h" +#include "tests/validation/reference/Permute.h" namespace arm_compute { @@ -97,7 +97,7 @@ protected: // Fill reference fill(src); - return reference::transpose(src); + return reference::permute(src, PermutationVector(1U, 0U)); } TensorType _target{}; @@ -106,4 +106,4 @@ protected: } // namespace validation } // namespace test } // namespace arm_compute -#endif /* ARM_COMPUTE_TEST_TRANSPOSE_FIXTURE */ +#endif // ACL_TESTS_VALIDATION_FIXTURES_TRANSPOSEFIXTURE_H -- cgit v1.2.1