From 0c19f59279a88384074635bf273a99001602ed21 Mon Sep 17 00:00:00 2001 From: Viet-Hoa Do Date: Tue, 1 Aug 2023 14:42:41 +0100 Subject: Fix CL Tile operator Resolves: COMPMID-6404 Signed-off-by: Viet-Hoa Do Change-Id: I75aebe620567ed50817747589bbe8cfb63715a7b Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/10036 Tested-by: Arm Jenkins Reviewed-by: TeresaARM Reviewed-by: Anitha Raj Reviewed-by: Pablo Marquez Tello Comments-Addressed: Arm Jenkins Benchmark: Arm Jenkins --- src/core/CL/cl_kernels/common/tile.cl | 31 ++++++++++++++----------------- src/core/CL/kernels/CLTileKernel.cpp | 13 ++++++++----- 2 files changed, 22 insertions(+), 22 deletions(-) (limited to 'src/core/CL') diff --git a/src/core/CL/cl_kernels/common/tile.cl b/src/core/CL/cl_kernels/common/tile.cl index 4332411688..971750b7b2 100644 --- a/src/core/CL/cl_kernels/common/tile.cl +++ b/src/core/CL/cl_kernels/common/tile.cl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2021 Arm Limited. + * Copyright (c) 2018-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -50,7 +50,7 @@ __kernel void tile( TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output)) { - Tensor4D output = CONVERT_TO_TENSOR4D_STRUCT(output, DST_DEPTH); + Tensor4D output = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, DST_DEPTH); Tensor4D input = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, SRC_DEPTH); // For all coordinates but x, each tile copies from the input @@ -62,22 +62,18 @@ __kernel void tile( // If we are loading/storing multiple elements at time, we need to // not exceed the input boundaries. The last threads need to backtrack // of OFFSET elements. Those elements cumulates for previous tiles - const int id = (int)(get_global_id(0)); - int x = id * VEC_SIZE; - // Shift x based on the previous offsets - const int tile_number = x / SRC_WIDTH; - x -= (tile_number) * OFFSET; - int x_input = x % SRC_WIDTH; + const int id = (int)(get_global_id(0)); + const int multiple_no = id / SRC_WIDTH_TILES; + const int tile_no = id % SRC_WIDTH_TILES; + const int last_tile = (int)(tile_no == SRC_WIDTH_TILES - 1); - // Shift x based on being the last tile - const int last_tile = (int)(x_input + VEC_SIZE > SRC_WIDTH); - x -= last_tile * OFFSET; - x_input = x % SRC_WIDTH; - output.ptr -= (tile_number + last_tile) * OFFSET * output_stride_x; + const int x_input = tile_no * VEC_SIZE - last_tile * OFFSET; + const int x_output = multiple_no * SRC_WIDTH + x_input; - // Update the input pointer - input.ptr = tensor4D_offset(&input, x_input, y % SRC_HEIGHT, z % SRC_DEPTH, batch % SRC_BATCHES); + // Update the input and output pointers. + input.ptr = tensor4D_offset(&input, x_input, y % SRC_HEIGHT, z % SRC_DEPTH, batch % SRC_BATCHES); + output.ptr = tensor4D_offset(&output, x_output, y, z, batch); // Copy the data VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) @@ -88,8 +84,9 @@ __kernel void tile( #else // !defined(VEC_SIZE) || !defined(OFFSET) const int x = get_global_id(0); - // Update the input pointer - input.ptr = tensor4D_offset(&input, x % SRC_WIDTH, y % SRC_HEIGHT, z % SRC_DEPTH, batch % SRC_BATCHES); + // Update the input and output pointers. + input.ptr = tensor4D_offset(&input, x % SRC_WIDTH, y % SRC_HEIGHT, z % SRC_DEPTH, batch % SRC_BATCHES); + output.ptr = tensor4D_offset(&output, x, y, z, batch); *((__global DATA_TYPE *)(output.ptr)) = *((__global DATA_TYPE *)(input.ptr)); #endif // defined(VEC_SIZE) && defined(OFFSET) diff --git a/src/core/CL/kernels/CLTileKernel.cpp b/src/core/CL/kernels/CLTileKernel.cpp index 9c678a3f7e..3e7015cfd2 100644 --- a/src/core/CL/kernels/CLTileKernel.cpp +++ b/src/core/CL/kernels/CLTileKernel.cpp @@ -80,11 +80,13 @@ void CLTileKernel::configure(const CLCompileContext &compile_context, const ICLT _input = input; _output = output; - const DataType data_type = input->info()->data_type(); - const int vec_size_x = 16 / input->info()->element_size(); - const int input_width_x = input->info()->tensor_shape().x(); - const unsigned int offset = ceil_to_multiple(input_width_x, vec_size_x) - input_width_x; - const bool multi_access_x = (input_width_x / vec_size_x > 0); + const DataType data_type = input->info()->data_type(); + const int vec_size_x = 16 / input->info()->element_size(); + const int input_width_x = input->info()->tensor_shape().x(); + const unsigned int input_width_ceil = ceil_to_multiple(input_width_x, vec_size_x); + const unsigned int input_width_tiles = input_width_ceil / vec_size_x; + const unsigned int offset = input_width_ceil - input_width_x; + const bool multi_access_x = (input_width_x / vec_size_x > 0); // Create kernel CLBuildOptions build_opts; @@ -96,6 +98,7 @@ void CLTileKernel::configure(const CLCompileContext &compile_context, const ICLT build_opts.add_option("-DDST_DEPTH=" + support::cpp11::to_string(output->info()->dimension(2))); build_opts.add_option_if(multi_access_x, "-DOFFSET=" + support::cpp11::to_string(offset)); build_opts.add_option_if(multi_access_x, "-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x)); + build_opts.add_option_if(multi_access_x, "-DSRC_WIDTH_TILES=" + support::cpp11::to_string(input_width_tiles)); _kernel = create_kernel(compile_context, "tile", build_opts.options()); // Configure window without padding -- cgit v1.2.1