Fix CL Tile operator

Resolves: COMPMID-6404 Signed-off-by: Viet-Hoa Do <viet-hoa.do@arm.com> Change-Id: I75aebe620567ed50817747589bbe8cfb63715a7b Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/10036 Tested-by: Arm Jenkins <bsgcomp@arm.com> Reviewed-by: TeresaARM <teresa.charlinreyes@arm.com> Reviewed-by: Anitha Raj <Anitha.Raj@arm.com> Reviewed-by: Pablo Marquez Tello <pablo.tello@arm.com> Comments-Addressed: Arm Jenkins <bsgcomp@arm.com> Benchmark: Arm Jenkins <bsgcomp@arm.com>
author: Viet-Hoa Do <viet-hoa.do@arm.com> 2023-08-01 14:42:41 +0100
committer: Viet-Hoa Do <viet-hoa.do@arm.com> 2023-08-03 10:33:26 +0000
commit: 0c19f59279a88384074635bf273a99001602ed21 (patch)
tree: c6522b58663538b9a8c8affdcfa938b1100d286d
parent: e98413e405015dec7e90946cc1e0c7b9921b0be3 (diff)
download: ComputeLibrary-0c19f59279a88384074635bf273a99001602ed21.tar.gz
5 files changed, 35 insertions, 24 deletions
diff --git a/docs/user_guide/errata.dox b/docs/user_guide/errata.dox
index d5689f40f0..525ad3e396 100644
--- a/docs/user_guide/errata.dox
+++ b/docs/user_guide/errata.dox
@@ -30,6 +30,15 @@ namespace arm_compute
 
 @section S7_1_errata Errata
 
+- (COMPMID-6404) Under certain conditions, CLTile may produce incorrect result.
+    - Versions: >= v19.02 && < v23.08
+    - OSs: Linux, Android.
+    - Conditions:
+        - The size of the lowest dimension of the input tensor is greater than 16 bytes.
+        - The size of the lowest dimension of the input tensor is not a multiple of 16.
+    - Result:
+        - Incorrect result is produced.
+
 - (COMPMID-6271) Under certain conditions, CLArgMinMaxLayer validation tests may fail
     - Versions Affected: >= v20.02 && < v23.08
     - OSs Affected: Linux
diff --git a/docs/user_guide/release_version_and_change_log.dox b/docs/user_guide/release_version_and_change_log.dox
index ce96370305..801f1f0b0f 100644
--- a/docs/user_guide/release_version_and_change_log.dox
+++ b/docs/user_guide/release_version_and_change_log.dox
@@ -43,12 +43,13 @@ If there is more than one release in a month then an extra sequential number is
 v23.08 Public major release
  - Deprecate the legacy 'libarm_compute_core' library. This library is an artifact of Compute Library's legacy library architecture and no longer serves any purpose.
  Users must no longer link their applications to this library and instead link only to the main `libarm_compute` library for core functionality.
- - Various optimizations and bug fixes.
  - New features
    - Add new OpenCL™ kernels:
      - @ref opencl::kernels::ClMatMulNativeMMULKernel support for FP32 and FP16, with batch support
    - Enable transposed convolution with non-square kernels on CPU and GPU.
    - Added support for input data type U64/S64 in CLCast.
+ - Various optimizations and bug fixes.
+
 v23.05.1 Public patch release
  - Enable CMake and Bazel option to build multi_isa without FP16 support.
  - Fix compilation error in NEReorderLayer (aarch64 only).
diff --git a/src/core/CL/cl_kernels/common/tile.cl b/src/core/CL/cl_kernels/common/tile.cl
index 4332411688..971750b7b2 100644
--- a/src/core/CL/cl_kernels/common/tile.cl
+++ b/src/core/CL/cl_kernels/common/tile.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -50,7 +50,7 @@ __kernel void tile(
     TENSOR4D_DECLARATION(input),
     TENSOR4D_DECLARATION(output))
 {
-    Tensor4D output = CONVERT_TO_TENSOR4D_STRUCT(output, DST_DEPTH);
+    Tensor4D output = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, DST_DEPTH);
     Tensor4D input  = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, SRC_DEPTH);
 
     // For all coordinates but x, each tile copies from the input
@@ -62,22 +62,18 @@ __kernel void tile(
     // If we are loading/storing multiple elements at time, we need to
     // not exceed the input boundaries. The last threads need to backtrack
     // of OFFSET elements. Those elements cumulates for previous tiles
-    const int id = (int)(get_global_id(0));
-    int       x  = id * VEC_SIZE;
 
-    // Shift x based on the previous offsets
-    const int tile_number = x / SRC_WIDTH;
-    x -= (tile_number) * OFFSET;
-    int x_input = x % SRC_WIDTH;
+    const int id          = (int)(get_global_id(0));
+    const int multiple_no = id / SRC_WIDTH_TILES;
+    const int tile_no     = id % SRC_WIDTH_TILES;
+    const int last_tile   = (int)(tile_no == SRC_WIDTH_TILES - 1);
 
-    // Shift x based on being the last tile
-    const int last_tile = (int)(x_input + VEC_SIZE > SRC_WIDTH);
-    x -= last_tile * OFFSET;
-    x_input = x % SRC_WIDTH;
-    output.ptr -= (tile_number + last_tile) * OFFSET * output_stride_x;
+    const int x_input  = tile_no * VEC_SIZE - last_tile * OFFSET;
+    const int x_output = multiple_no * SRC_WIDTH + x_input;
 
-    // Update the input pointer
-    input.ptr = tensor4D_offset(&input, x_input, y % SRC_HEIGHT, z % SRC_DEPTH, batch % SRC_BATCHES);
+    // Update the input and output pointers.
+    input.ptr  = tensor4D_offset(&input, x_input, y % SRC_HEIGHT, z % SRC_DEPTH, batch % SRC_BATCHES);
+    output.ptr = tensor4D_offset(&output, x_output, y, z, batch);
 
     // Copy the data
     VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
@@ -88,8 +84,9 @@ __kernel void tile(
 #else  // !defined(VEC_SIZE) || !defined(OFFSET)
     const int x = get_global_id(0);
 
-    // Update the input pointer
-    input.ptr = tensor4D_offset(&input, x % SRC_WIDTH, y % SRC_HEIGHT, z % SRC_DEPTH, batch % SRC_BATCHES);
+    // Update the input and output pointers.
+    input.ptr  = tensor4D_offset(&input, x % SRC_WIDTH, y % SRC_HEIGHT, z % SRC_DEPTH, batch % SRC_BATCHES);
+    output.ptr = tensor4D_offset(&output, x, y, z, batch);
 
     *((__global DATA_TYPE *)(output.ptr)) = *((__global DATA_TYPE *)(input.ptr));
 #endif // defined(VEC_SIZE) && defined(OFFSET)
diff --git a/src/core/CL/kernels/CLTileKernel.cpp b/src/core/CL/kernels/CLTileKernel.cpp
index 9c678a3f7e..3e7015cfd2 100644
--- a/src/core/CL/kernels/CLTileKernel.cpp
+++ b/src/core/CL/kernels/CLTileKernel.cpp
@@ -80,11 +80,13 @@ void CLTileKernel::configure(const CLCompileContext &compile_context, const ICLT
     _input  = input;
     _output = output;
 
-    const DataType     data_type      = input->info()->data_type();
-    const int          vec_size_x     = 16 / input->info()->element_size();
-    const int          input_width_x  = input->info()->tensor_shape().x();
-    const unsigned int offset         = ceil_to_multiple(input_width_x, vec_size_x) - input_width_x;
-    const bool         multi_access_x = (input_width_x / vec_size_x > 0);
+    const DataType     data_type         = input->info()->data_type();
+    const int          vec_size_x        = 16 / input->info()->element_size();
+    const int          input_width_x     = input->info()->tensor_shape().x();
+    const unsigned int input_width_ceil  = ceil_to_multiple(input_width_x, vec_size_x);
+    const unsigned int input_width_tiles = input_width_ceil / vec_size_x;
+    const unsigned int offset            = input_width_ceil - input_width_x;
+    const bool         multi_access_x    = (input_width_x / vec_size_x > 0);
 
     // Create kernel
     CLBuildOptions build_opts;
@@ -96,6 +98,7 @@ void CLTileKernel::configure(const CLCompileContext &compile_context, const ICLT
     build_opts.add_option("-DDST_DEPTH=" + support::cpp11::to_string(output->info()->dimension(2)));
     build_opts.add_option_if(multi_access_x, "-DOFFSET=" + support::cpp11::to_string(offset));
     build_opts.add_option_if(multi_access_x, "-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x));
+    build_opts.add_option_if(multi_access_x, "-DSRC_WIDTH_TILES=" + support::cpp11::to_string(input_width_tiles));
     _kernel = create_kernel(compile_context, "tile", build_opts.options());
 
     // Configure window without padding
diff --git a/tests/validation/CL/Tile.cpp b/tests/validation/CL/Tile.cpp
index a06c05744f..f243780c00 100644
--- a/tests/validation/CL/Tile.cpp
+++ b/tests/validation/CL/Tile.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2020, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -42,6 +42,7 @@ namespace validation
 namespace
 {
 const auto MultiplesDataset = framework::dataset::make("Multiples", { Multiples{ 3 },
+                                                                      Multiples{ 7 },
                                                                       Multiples{ 2, 2 },
                                                                       Multiples{ 1, 1, 3, 4 },
                                                                       Multiples{ 2, 1, 2, 2 },
author	Viet-Hoa Do <viet-hoa.do@arm.com>	2023-08-01 14:42:41 +0100
committer	Viet-Hoa Do <viet-hoa.do@arm.com>	2023-08-03 10:33:26 +0000
commit	0c19f59279a88384074635bf273a99001602ed21 (patch)
tree	c6522b58663538b9a8c8affdcfa938b1100d286d
parent	e98413e405015dec7e90946cc1e0c7b9921b0be3 (diff)
download	ComputeLibrary-0c19f59279a88384074635bf273a99001602ed21.tar.gz