diff options
Diffstat (limited to 'src')
-rw-r--r-- | src/core/CL/cl_kernels/common/transpose.cl | 17 | ||||
-rw-r--r-- | src/cpu/kernels/CpuTransposeKernel.cpp | 5 | ||||
-rw-r--r-- | src/gpu/cl/kernels/ClTransposeKernel.cpp | 46 | ||||
-rw-r--r-- | src/gpu/cl/kernels/ClTransposeKernel.h | 10 |
4 files changed, 55 insertions, 23 deletions
diff --git a/src/core/CL/cl_kernels/common/transpose.cl b/src/core/CL/cl_kernels/common/transpose.cl index 82db2908b5..5b4c68ca10 100644 --- a/src/core/CL/cl_kernels/common/transpose.cl +++ b/src/core/CL/cl_kernels/common/transpose.cl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2021 Arm Limited. + * Copyright (c) 2017-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -124,23 +124,28 @@ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) * @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes) * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source matrix in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix * @param[out] dst_ptr Pointer to the destination matrix Supported data type: same as src_ptr * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes) * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes) * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes) * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination matrix in Z dimension (in bytes) + * @param[in] dst_step_z dst_gx_stride_z * number of elements along Z processed per workitem(in bytes) * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix */ -__kernel void transpose(IMAGE_DECLARATION(src), - IMAGE_DECLARATION(dst)) +__kernel void transpose(TENSOR3D_DECLARATION(src), + TENSOR3D_DECLARATION(dst)) { uint x_offs = max((int)(get_global_id(0) * VEC_SIZE_X - (VEC_SIZE_X - VEC_SIZE_LEFTOVER_X) % VEC_SIZE_X), 0); uint y_offs = max((int)(get_global_id(1) * VEC_SIZE_Y - (VEC_SIZE_Y - VEC_SIZE_LEFTOVER_Y) % VEC_SIZE_Y), 0); + uint z_offs = get_global_id(2); // Compute addresses - __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x_offs * DATA_TYPE_IN_BYTES + y_offs * src_stride_y; - __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + y_offs * DATA_TYPE_IN_BYTES + x_offs * dst_stride_y; + __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x_offs * DATA_TYPE_IN_BYTES + y_offs * src_stride_y + z_offs * src_stride_z; + __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + y_offs * DATA_TYPE_IN_BYTES + x_offs * dst_stride_y + z_offs * dst_stride_z; // Load the NxM block at (x, y) VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X) @@ -237,4 +242,4 @@ __kernel void transpose(IMAGE_DECLARATION(src), VEC_SIZE_LEFTOVER_Y != 0 && get_global_id(1) == 0); } -#endif // defined(DATA_TYPE_IN_BYTES) && defined(VEC_SIZE_X) && defined(VEC_SIZE_LEFTOVER_X) && defined(VEC_SIZE_Y) && defined(VEC_SIZE_LEFTOVER_Y)
\ No newline at end of file +#endif // defined(DATA_TYPE_IN_BYTES) && defined(VEC_SIZE_X) && defined(VEC_SIZE_LEFTOVER_X) && defined(VEC_SIZE_Y) && defined(VEC_SIZE_LEFTOVER_Y) diff --git a/src/cpu/kernels/CpuTransposeKernel.cpp b/src/cpu/kernels/CpuTransposeKernel.cpp index 615bc6ce1e..0f762ba041 100644 --- a/src/cpu/kernels/CpuTransposeKernel.cpp +++ b/src/cpu/kernels/CpuTransposeKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -737,6 +737,9 @@ void CpuTransposeKernel::configure(const ITensorInfo *src, ITensorInfo *dst) const TensorShape dst_shape = misc::shape_calculator::compute_transposed_shape(*src); auto_init_if_empty(*dst, src->clone()->set_tensor_shape(dst_shape)); + // Explicitly set the tensor shape to preserve dimensions + dst->set_tensor_shape(dst_shape); + // Perform validation step ARM_COMPUTE_ERROR_THROW_ON(validate(src, dst)); diff --git a/src/gpu/cl/kernels/ClTransposeKernel.cpp b/src/gpu/cl/kernels/ClTransposeKernel.cpp index 6eb2bf81c0..f95a215107 100644 --- a/src/gpu/cl/kernels/ClTransposeKernel.cpp +++ b/src/gpu/cl/kernels/ClTransposeKernel.cpp @@ -58,14 +58,37 @@ void ClTransposeKernel::configure(const CLCompileContext &compile_context, const const TensorShape dst_shape = misc::shape_calculator::compute_transposed_shape(*src); auto_init_if_empty(*dst, src->clone()->set_tensor_shape(dst_shape)); + // Explicitly set the tensor shape to preserve dimensions + dst->set_tensor_shape(dst_shape); + ARM_COMPUTE_ERROR_THROW_ON(ClTransposeKernel::validate(src, dst)); auto padding_info = get_padding_info({src, dst}); - // Create kernel - const unsigned int vec_size_x = adjust_vec_size(max_cl_vector_width / src->element_size(), src->dimension(0)); - const int vec_size_x_leftovers = src->dimension(0) % vec_size_x; - const unsigned int vec_size_y = adjust_vec_size(max_cl_vector_width / src->element_size(), src->dimension(1)); - const int vec_size_y_leftovers = src->dimension(1) % vec_size_y; + unsigned int vec_size_x; + unsigned int vec_size_y; + + // Set the optimal tile size for each data type without register spilling + switch (src->element_size()) + { + case 1: + vec_size_x = adjust_vec_size(8, src->dimension(0)); + vec_size_y = adjust_vec_size(16, src->dimension(1)); + break; + case 2: + vec_size_x = adjust_vec_size(8, src->dimension(0)); + vec_size_y = adjust_vec_size(8, src->dimension(1)); + break; + case 4: + vec_size_x = adjust_vec_size(4, src->dimension(0)); + vec_size_y = adjust_vec_size(8, src->dimension(1)); + break; + default: + ARM_COMPUTE_ERROR("Unsupported data type"); + break; + } + + const int vec_size_x_leftovers = src->dimension(0) % vec_size_x; + const int vec_size_y_leftovers = src->dimension(1) % vec_size_y; CLBuildOptions build_opts; build_opts.add_option("-DDATA_TYPE_IN_BYTES=" + support::cpp11::to_string(src->element_size())); @@ -78,7 +101,7 @@ void ClTransposeKernel::configure(const CLCompileContext &compile_context, const // Configure kernel window Window win = calculate_max_window(*src, Steps(vec_size_x, vec_size_y)); - ICLKernel::configure_internal(win, cl::NDRange(2, 8)); + ICLKernel::configure_internal(win); ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } @@ -87,7 +110,6 @@ Status ClTransposeKernel::validate(const ITensorInfo *src, const ITensorInfo *ds ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->num_dimensions() > 2, "Transpose up to 2-D src tensor is supported"); // Validate configured dst if (dst->total_size() != 0) @@ -112,15 +134,17 @@ void ClTransposeKernel::run_op(ITensorPack &tensors, const Window &window, cl::C utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); - Window slice = window.first_slice_window_2D(); + // Collapse dimensions higher than width and height into the batch dimension + Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); + Window slice = collapsed.first_slice_window_3D(); do { unsigned int idx = 0; - add_2D_tensor_argument(idx, src, slice); - add_2D_tensor_argument(idx, dst, slice); + add_3D_tensor_argument(idx, src, slice); + add_3D_tensor_argument(idx, dst, slice); enqueue(queue, *this, slice, lws_hint()); - } while (window.slide_window_slice_2D(slice)); + } while (collapsed.slide_window_slice_3D(slice)); } } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClTransposeKernel.h b/src/gpu/cl/kernels/ClTransposeKernel.h index b30d6f0281..eaad38b20f 100644 --- a/src/gpu/cl/kernels/ClTransposeKernel.h +++ b/src/gpu/cl/kernels/ClTransposeKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2021 Arm Limited. + * Copyright (c) 2017-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_CL_TRANSPOSE_KERNEL_H -#define ARM_COMPUTE_CL_TRANSPOSE_KERNEL_H +#ifndef ACL_SRC_GPU_CL_KERNELS_CLTRANSPOSEKERNEL_H +#define ACL_SRC_GPU_CL_KERNELS_CLTRANSPOSEKERNEL_H #include "src/core/common/Macros.h" #include "src/gpu/cl/ClCompileContext.h" @@ -34,7 +34,7 @@ namespace opencl { namespace kernels { -/** OpenCL kernel to transpose a 2D tensor. */ +/** OpenCL kernel to transpose a tensor. Only the first two dimensions (width, height) are transposed. */ class ClTransposeKernel : public IClKernel { public: @@ -61,4 +61,4 @@ public: } // namespace kernels } // namespace opencl } // namespace arm_compute -#endif /* ARM_COMPUTE_CL_TRANSPOSE_KERNEL_H */ +#endif // ACL_SRC_GPU_CL_KERNELS_CLTRANSPOSEKERNEL_H |