From 0c19f59279a88384074635bf273a99001602ed21 Mon Sep 17 00:00:00 2001
From: Viet-Hoa Do <viet-hoa.do@arm.com>
Date: Tue, 1 Aug 2023 14:42:41 +0100
Subject: Fix CL Tile operator

Resolves: COMPMID-6404
Signed-off-by: Viet-Hoa Do <viet-hoa.do@arm.com>
Change-Id: I75aebe620567ed50817747589bbe8cfb63715a7b
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/10036
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: TeresaARM <teresa.charlinreyes@arm.com>
Reviewed-by: Anitha Raj <Anitha.Raj@arm.com>
Reviewed-by: Pablo Marquez Tello <pablo.tello@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Benchmark: Arm Jenkins <bsgcomp@arm.com>
---
 src/core/CL/cl_kernels/common/tile.cl | 31 ++++++++++++++-----------------
 src/core/CL/kernels/CLTileKernel.cpp  | 13 ++++++++-----
 2 files changed, 22 insertions(+), 22 deletions(-)

(limited to 'src/core/CL')

diff --git a/src/core/CL/cl_kernels/common/tile.cl b/src/core/CL/cl_kernels/common/tile.cl
index 4332411688..971750b7b2 100644
--- a/src/core/CL/cl_kernels/common/tile.cl
+++ b/src/core/CL/cl_kernels/common/tile.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -50,7 +50,7 @@ __kernel void tile(
     TENSOR4D_DECLARATION(input),
     TENSOR4D_DECLARATION(output))
 {
-    Tensor4D output = CONVERT_TO_TENSOR4D_STRUCT(output, DST_DEPTH);
+    Tensor4D output = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, DST_DEPTH);
     Tensor4D input  = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, SRC_DEPTH);
 
     // For all coordinates but x, each tile copies from the input
@@ -62,22 +62,18 @@ __kernel void tile(
     // If we are loading/storing multiple elements at time, we need to
     // not exceed the input boundaries. The last threads need to backtrack
     // of OFFSET elements. Those elements cumulates for previous tiles
-    const int id = (int)(get_global_id(0));
-    int       x  = id * VEC_SIZE;
 
-    // Shift x based on the previous offsets
-    const int tile_number = x / SRC_WIDTH;
-    x -= (tile_number) * OFFSET;
-    int x_input = x % SRC_WIDTH;
+    const int id          = (int)(get_global_id(0));
+    const int multiple_no = id / SRC_WIDTH_TILES;
+    const int tile_no     = id % SRC_WIDTH_TILES;
+    const int last_tile   = (int)(tile_no == SRC_WIDTH_TILES - 1);
 
-    // Shift x based on being the last tile
-    const int last_tile = (int)(x_input + VEC_SIZE > SRC_WIDTH);
-    x -= last_tile * OFFSET;
-    x_input = x % SRC_WIDTH;
-    output.ptr -= (tile_number + last_tile) * OFFSET * output_stride_x;
+    const int x_input  = tile_no * VEC_SIZE - last_tile * OFFSET;
+    const int x_output = multiple_no * SRC_WIDTH + x_input;
 
-    // Update the input pointer
-    input.ptr = tensor4D_offset(&input, x_input, y % SRC_HEIGHT, z % SRC_DEPTH, batch % SRC_BATCHES);
+    // Update the input and output pointers.
+    input.ptr  = tensor4D_offset(&input, x_input, y % SRC_HEIGHT, z % SRC_DEPTH, batch % SRC_BATCHES);
+    output.ptr = tensor4D_offset(&output, x_output, y, z, batch);
 
     // Copy the data
     VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
@@ -88,8 +84,9 @@ __kernel void tile(
 #else  // !defined(VEC_SIZE) || !defined(OFFSET)
     const int x = get_global_id(0);
 
-    // Update the input pointer
-    input.ptr = tensor4D_offset(&input, x % SRC_WIDTH, y % SRC_HEIGHT, z % SRC_DEPTH, batch % SRC_BATCHES);
+    // Update the input and output pointers.
+    input.ptr  = tensor4D_offset(&input, x % SRC_WIDTH, y % SRC_HEIGHT, z % SRC_DEPTH, batch % SRC_BATCHES);
+    output.ptr = tensor4D_offset(&output, x, y, z, batch);
 
     *((__global DATA_TYPE *)(output.ptr)) = *((__global DATA_TYPE *)(input.ptr));
 #endif // defined(VEC_SIZE) && defined(OFFSET)
diff --git a/src/core/CL/kernels/CLTileKernel.cpp b/src/core/CL/kernels/CLTileKernel.cpp
index 9c678a3f7e..3e7015cfd2 100644
--- a/src/core/CL/kernels/CLTileKernel.cpp
+++ b/src/core/CL/kernels/CLTileKernel.cpp
@@ -80,11 +80,13 @@ void CLTileKernel::configure(const CLCompileContext &compile_context, const ICLT
     _input  = input;
     _output = output;
 
-    const DataType     data_type      = input->info()->data_type();
-    const int          vec_size_x     = 16 / input->info()->element_size();
-    const int          input_width_x  = input->info()->tensor_shape().x();
-    const unsigned int offset         = ceil_to_multiple(input_width_x, vec_size_x) - input_width_x;
-    const bool         multi_access_x = (input_width_x / vec_size_x > 0);
+    const DataType     data_type         = input->info()->data_type();
+    const int          vec_size_x        = 16 / input->info()->element_size();
+    const int          input_width_x     = input->info()->tensor_shape().x();
+    const unsigned int input_width_ceil  = ceil_to_multiple(input_width_x, vec_size_x);
+    const unsigned int input_width_tiles = input_width_ceil / vec_size_x;
+    const unsigned int offset            = input_width_ceil - input_width_x;
+    const bool         multi_access_x    = (input_width_x / vec_size_x > 0);
 
     // Create kernel
     CLBuildOptions build_opts;
@@ -96,6 +98,7 @@ void CLTileKernel::configure(const CLCompileContext &compile_context, const ICLT
     build_opts.add_option("-DDST_DEPTH=" + support::cpp11::to_string(output->info()->dimension(2)));
     build_opts.add_option_if(multi_access_x, "-DOFFSET=" + support::cpp11::to_string(offset));
     build_opts.add_option_if(multi_access_x, "-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x));
+    build_opts.add_option_if(multi_access_x, "-DSRC_WIDTH_TILES=" + support::cpp11::to_string(input_width_tiles));
     _kernel = create_kernel(compile_context, "tile", build_opts.options());
 
     // Configure window without padding
-- 
cgit v1.2.1