Diffstat (limited to 'src/core/CL/kernels/CLTileKernel.cpp')
-rw-r--r--  src/core/CL/kernels/CLTileKernel.cpp  47
1 file changed, 26 insertions, 21 deletions
diff --git a/src/core/CL/kernels/CLTileKernel.cpp b/src/core/CL/kernels/CLTileKernel.cpp
index c0c3d2e2ee..fa996c4008 100644
--- a/src/core/CL/kernels/CLTileKernel.cpp
+++ b/src/core/CL/kernels/CLTileKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,8 +22,11 @@
* SOFTWARE.
*/
#include "src/core/CL/kernels/CLTileKernel.h"
+
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
#include "support/StringSupport.h"
@@ -38,15 +41,13 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
ARM_COMPUTE_RETURN_ERROR_ON(multiples.size() > 4);
ARM_COMPUTE_RETURN_ERROR_ON(multiples.empty());
- ARM_COMPUTE_RETURN_ERROR_ON(std::any_of(multiples.begin(), multiples.end(), [](uint32_t e)
- {
- return e == 0;
- }));
+ ARM_COMPUTE_RETURN_ERROR_ON(std::any_of(multiples.begin(), multiples.end(), [](uint32_t e) { return e == 0; }));
// Validate output if initialized
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(misc::shape_calculator::compute_tiled_shape(input->tensor_shape(), multiples), output->tensor_shape());
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(
+ misc::shape_calculator::compute_tiled_shape(input->tensor_shape(), multiples), output->tensor_shape());
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
}
@@ -54,9 +55,9 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
}
} // namespace
-CLTileKernel::CLTileKernel()
- : _input(nullptr), _output(nullptr)
+CLTileKernel::CLTileKernel() : _input(nullptr), _output(nullptr)
{
+ _type = CLKernelType::ELEMENTWISE;
}
void CLTileKernel::configure(const ICLTensor *input, ICLTensor *output, const Multiples &multiples)
@@ -64,7 +65,10 @@ void CLTileKernel::configure(const ICLTensor *input, ICLTensor *output, const Mu
configure(CLKernelLibrary::get().get_compile_context(), input, output, multiples);
}
-void CLTileKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const Multiples &multiples)
+void CLTileKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ const Multiples &multiples)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
@@ -78,11 +82,13 @@ void CLTileKernel::configure(const CLCompileContext &compile_context, const ICLT
_input = input;
_output = output;
- const DataType data_type = input->info()->data_type();
- const int vec_size_x = 16 / input->info()->element_size();
- const int input_width_x = input->info()->tensor_shape().x();
- const unsigned int offset = ceil_to_multiple(input_width_x, vec_size_x) - input_width_x;
- const bool multi_access_x = (input_width_x / vec_size_x > 0);
+ const DataType data_type = input->info()->data_type();
+ const int vec_size_x = 16 / input->info()->element_size();
+ const int input_width_x = input->info()->tensor_shape().x();
+ const unsigned int input_width_ceil = ceil_to_multiple(input_width_x, vec_size_x);
+ const unsigned int input_width_tiles = input_width_ceil / vec_size_x;
+ const unsigned int offset = input_width_ceil - input_width_x;
+ const bool multi_access_x = (input_width_x / vec_size_x > 0);
// Create kernel
CLBuildOptions build_opts;
@@ -94,20 +100,20 @@ void CLTileKernel::configure(const CLCompileContext &compile_context, const ICLT
build_opts.add_option("-DDST_DEPTH=" + support::cpp11::to_string(output->info()->dimension(2)));
build_opts.add_option_if(multi_access_x, "-DOFFSET=" + support::cpp11::to_string(offset));
build_opts.add_option_if(multi_access_x, "-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x));
+ build_opts.add_option_if(multi_access_x, "-DSRC_WIDTH_TILES=" + support::cpp11::to_string(input_width_tiles));
_kernel = create_kernel(compile_context, "tile", build_opts.options());
// Configure window without padding
Window win = calculate_max_window(*output->info());
- if(multi_access_x)
+ if (multi_access_x)
{
// If multi-access is enabled, no thread should cross the tile boundaries. This means we need
// as many threads as those to cover a single tile times multiples[0]. Note that if threads
// do not cross the boundaries of the tiles, they won't cross the boundaries of the last tile, and
// we don't need to pad the output
const unsigned int size_win_x = ceil_to_multiple(input->info()->dimension(0), vec_size_x) * multiples[0];
- win.set(Window::DimX,
- Window::Dimension(win.x().start(), size_win_x, vec_size_x));
+ win.set(Window::DimX, Window::Dimension(win.x().start(), size_win_x, vec_size_x));
}
ICLKernel::configure_internal(win);
@@ -116,7 +122,7 @@ void CLTileKernel::configure(const CLCompileContext &compile_context, const ICLT
_config_id = "tile";
_config_id += "_";
_config_id += lower_string(string_from_data_type(input->info()->data_type()));
- for(unsigned int i = 0; i < multiples.size(); ++i)
+ for (unsigned int i = 0; i < multiples.size(); ++i)
{
_config_id += "_";
_config_id += support::cpp11::to_string(input->info()->dimension(i));
@@ -145,7 +151,6 @@ void CLTileKernel::run(const Window &window, cl::CommandQueue &queue)
add_4D_tensor_argument(idx, _input, slice);
add_4D_tensor_argument(idx, _output, slice);
enqueue(queue, *this, slice, lws_hint());
- }
- while(collapsed.slide_window_slice_4D(slice));
+ } while (collapsed.slide_window_slice_4D(slice));
}
} // namespace arm_compute
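
For context, a minimal standalone sketch (not part of the patch) of the width/offset arithmetic from which the new SRC_WIDTH_TILES build option is derived in the hunk above. ceil_to_multiple is re-implemented locally for the sketch rather than taken from the library headers, and the FP16 input width of 21 elements is an illustrative assumption only.

#include <cstdio>

// Local stand-in for the library's ceil_to_multiple utility:
// rounds `value` up to the next multiple of `divisor`.
static unsigned int ceil_to_multiple(unsigned int value, unsigned int divisor)
{
    return ((value + divisor - 1) / divisor) * divisor;
}

int main()
{
    // Assumed example: FP16 input (element_size == 2) that is 21 elements wide.
    const unsigned int element_size  = 2;
    const unsigned int input_width_x = 21;

    const unsigned int vec_size_x        = 16 / element_size;                           // 8 elements per 16-byte access
    const unsigned int input_width_ceil  = ceil_to_multiple(input_width_x, vec_size_x); // 24
    const unsigned int input_width_tiles = input_width_ceil / vec_size_x;               // 3 vector accesses per row (-> SRC_WIDTH_TILES)
    const unsigned int offset            = input_width_ceil - input_width_x;            // 3 elements the last access rewinds by (-> OFFSET)
    const bool         multi_access_x    = (input_width_x / vec_size_x > 0);            // true: at least one full vector fits

    std::printf("vec_size_x=%u tiles=%u offset=%u multi_access=%d\n",
                vec_size_x, input_width_tiles, offset, multi_access_x);
    return 0;
}

With these numbers the window in the patch would span ceil_to_multiple(21, 8) * multiples[0] positions in X with a step of 8, so no work-item crosses a tile boundary and the output needs no padding, matching the comment in the configure() hunk.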