From 8bab0ee5f44a0e2cfe60d4d9e040a2f5ae4ef9b1 Mon Sep 17 00:00:00 2001
From: Gian Marco Iodice
Date: Thu, 13 Sep 2018 11:51:56 +0100
Subject: COMPMID-1584 - Collapse batch size in CLChannelShuffleLayerKernel

COMPMID-1589 - Add support for NHWC to CLChannelShuffleLayerKernel

Change-Id: I13936a5cd1659d01fdb10b346e90f0d72d79f1f1
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/148475
Tested-by: bsgcomp
Reviewed-by: Michele DiGiorgio
---
 src/core/CL/CLKernelLibrary.cpp                  |   1 +
 src/core/CL/cl_kernels/channel_shuffle.cl        | 201 +++++++++++++--------
 .../CL/kernels/CLChannelShuffleLayerKernel.cpp   |  61 +++++--
 3 files changed, 169 insertions(+), 94 deletions(-)

(limited to 'src/core')

diff --git a/src/core/CL/CLKernelLibrary.cpp b/src/core/CL/CLKernelLibrary.cpp
index ef3a431f1a..fa164542e4 100644
--- a/src/core/CL/CLKernelLibrary.cpp
+++ b/src/core/CL/CLKernelLibrary.cpp
@@ -167,6 +167,7 @@ const std::map<std::string, std::string> CLKernelLibrary::_kernel_program_map =
     { "channel_combine_UYVY422", "channel_combine.cl" },
     { "channel_combine_YUYV422", "channel_combine.cl" },
     { "channel_shuffle_nchw", "channel_shuffle.cl" },
+    { "channel_shuffle_nhwc", "channel_shuffle.cl" },
     { "channel_extract_NV12", "channel_extract.cl" },
     { "channel_extract_NV21", "channel_extract.cl" },
     { "channel_extract_RGB888", "channel_extract.cl" },
diff --git a/src/core/CL/cl_kernels/channel_shuffle.cl b/src/core/CL/cl_kernels/channel_shuffle.cl
index 23962e1c2e..3ac67c58ae 100644
--- a/src/core/CL/cl_kernels/channel_shuffle.cl
+++ b/src/core/CL/cl_kernels/channel_shuffle.cl
@@ -23,19 +23,28 @@
  */
 #include "helpers.h"
 
-#if defined(DATA_TYPE) && defined(BLOCK_SIZE) && defined(NUM_GROUPS) && defined(K)
+#if defined(DATA_TYPE) && defined(VEC_SIZE) && defined(NUM_GROUPS) && defined(K) && defined(SRC_DIM_Z)
 
-// Check valid BLOCK_SIZES
-#if BLOCK_SIZE != 4 && BLOCK_SIZE != 8 && BLOCK_SIZE != 16
-#error "Only block sizes 4, 8 and 16 are supported"
-#endif /* BLOCK_SIZE != 4 && BLOCK_SIZE != 8 && BLOCK_SIZE != 16 */
+// Check valid VEC_SIZES
+#if VEC_SIZE != 4 && VEC_SIZE != 8 && VEC_SIZE != 16
+#error "Only vector sizes 4, 8 and 16 are supported"
+#endif // VEC_SIZE != 4 && VEC_SIZE != 8 && VEC_SIZE != 16
 
-#define TYPE VEC_DATA_TYPE(DATA_TYPE, BLOCK_SIZE)
+#define TYPE VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
 
-/** Perfoms channel shuffle see https://arxiv.org/pdf/1707.01083.pdf for details.
+// Unsigned division and modulo computed with a float reciprocal; intended for the small index values used in these kernels
+#define DIV_MOD_UINT(x, y, div_res, mod_res)                \
+    ({                                                      \
+        div_res = (uint)((x) * (float)(1.0f / (float)(y))); \
+        uint r  = div_res * (y);                            \
+        mod_res = (x)-r;                                    \
+    })
+
+/** Performs channel shuffle when the data layout is NCHW. See https://arxiv.org/pdf/1707.01083.pdf for details.
  *
- * @note The number of groups should be given as a preprocessor argument using -DNUM_GROUPS=num_groups. e.g. -DNUM_GROUPS=2
- * @note The number of channels in each group should be given as a preprocessor argument using -DK=num. e.g. -DK=1
+ * @note The vector size must be given as a preprocessor argument using -DVEC_SIZE=num. e.g. -DVEC_SIZE=4
+ * @note The depth of the tensor must be given as a preprocessor argument using -DSRC_DIM_Z=num. e.g. -DSRC_DIM_Z=64
+ * @note The number of groups must be given as a preprocessor argument using -DNUM_GROUPS=num_groups. e.g. -DNUM_GROUPS=2
+ * @note The number of channels in each group must be given as a preprocessor argument using -DK=num. e.g. -DK=1
  *       K is equal to num_channels / num_groups.
  *
  * @param[in]  src_ptr                           Pointer to the source matrix. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
@@ -45,6 +54,8 @@
  * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
  * @param[in]  src_stride_z                      Stride of the first source tensor in Z dimension (in bytes)
  * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_stride_w                      Stride of the first source tensor in W dimension (in bytes)
+ * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
  * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the first source tensor
  * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
@@ -53,80 +64,118 @@
  * @param[in]  dst_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
  * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
  * @param[in]  dst_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
+ * @param[in]  dst_step_w                        output_stride_w * number of elements along W processed per workitem(in bytes)
  * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
  */
-__kernel void channel_shuffle_nchw(TENSOR3D_DECLARATION(src),
-                                   TENSOR3D_DECLARATION(dst))
+__kernel void channel_shuffle_nchw(TENSOR4D_DECLARATION(src),
+                                   TENSOR4D_DECLARATION(dst))
 {
-    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
-    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(dst);
+    uint curr_channel = 0; // channel id of input
+    uint batch_id     = 0; // batch id
+    uint group_id     = 0; // group id
+    uint channel_id   = 0; // channel id within the group
+
+    // Compute curr_channel and batch_id
+    DIV_MOD_UINT(get_global_id(2), SRC_DIM_Z, batch_id, curr_channel);
 
-    const uint curr_channel = get_global_id(2); // channel id of input
-    const uint group_id     = curr_channel / K; // group id
-    const uint channel_id   = curr_channel % K; // channel id within the group
+    // Compute group_id and channel_id
+    DIV_MOD_UINT(curr_channel, K, group_id, channel_id);
 
-    const uint x = get_global_id(0) * BLOCK_SIZE;
-    const uint y = get_global_id(1) * BLOCK_SIZE;
+    const uint x = get_global_id(0) * VEC_SIZE;
+    const uint y = get_global_id(1) * 2;
     const uint z = channel_id * NUM_GROUPS + group_id;
 
-    // Load the NxN block
-    TYPE u0 = VLOAD(BLOCK_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&src, 0, 0, 0));
-    TYPE u1 = VLOAD(BLOCK_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&src, 0, 1, 0));
-    TYPE u2 = VLOAD(BLOCK_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&src, 0, 2, 0));
-    TYPE u3 = VLOAD(BLOCK_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&src, 0, 3, 0));
-#if BLOCK_SIZE > 4
-    TYPE u4 = VLOAD(BLOCK_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&src, 0, 4, 0));
-    TYPE u5 = VLOAD(BLOCK_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&src, 0, 5, 0));
-    TYPE u6 = VLOAD(BLOCK_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&src, 0, 6, 0));
-    TYPE u7 = VLOAD(BLOCK_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&src, 0, 7, 0));
-#if BLOCK_SIZE == 16
-    TYPE u8  = VLOAD(BLOCK_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&src, 0, 8, 0));
-    TYPE u9  = VLOAD(BLOCK_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&src, 0, 9, 0));
-    TYPE u10 = VLOAD(BLOCK_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&src, 0, 10, 0));
-    TYPE u11 = VLOAD(BLOCK_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&src, 0, 11, 0));
-    TYPE u12 = VLOAD(BLOCK_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&src, 0, 12, 0));
-    TYPE u13 = VLOAD(BLOCK_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&src, 0, 13, 0));
-    TYPE u14 = VLOAD(BLOCK_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&src, 0, 14, 0));
-    TYPE u15 = VLOAD(BLOCK_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&src, 0, 15, 0));
-#endif /* BLOCK_SIZE == 16 */
-#endif /* BLOCK_SIZE > 4 */
+    // Load the Nx2 block
+    const __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * src_stride_y + curr_channel * src_stride_z + batch_id * src_stride_w;
+    TYPE u0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_ptr + 0 * src_stride_y));
+    TYPE u1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_ptr + 1 * src_stride_y));
+
+    // Store blocks
+    __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * dst_stride_y + z * dst_stride_z + batch_id * dst_stride_w;
+    VSTORE(VEC_SIZE)
+    (u0, 0, (__global DATA_TYPE *)(output_ptr + 0 * dst_stride_y));
+    VSTORE(VEC_SIZE)
+    (u1, 0, (__global DATA_TYPE *)(output_ptr + 1 * dst_stride_y));
+}
+
+#if VEC_SIZE == 4 && defined(LAST_ACCESSED)
+/** Performs channel shuffle when the data layout is NHWC. See https://arxiv.org/pdf/1707.01083.pdf for details.
+ *
+ * @note This implementation is only defined for VEC_SIZE = 4
+ * @note The last element accessed along the first dimension must be given as a preprocessor argument using -DLAST_ACCESSED=num. e.g. -DLAST_ACCESSED=64 in order to prevent out-of-bound writes.
+ * @note The vector size must be given as a preprocessor argument using -DVEC_SIZE=num. e.g. -DVEC_SIZE=4
+ * @note The height of the tensor must be given as a preprocessor argument using -DSRC_DIM_Z=num. e.g. -DSRC_DIM_Z=64
+ * @note The number of groups must be given as a preprocessor argument using -DNUM_GROUPS=num_groups. e.g. -DNUM_GROUPS=2
+ * @note The number of channels in each group must be given as a preprocessor argument using -DK=num. e.g. -DK=1
+ *       K is equal to num_channels / num_groups.
+ *
+ * @param[in]  src_ptr                           Pointer to the source matrix. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in]  src_stride_x                      Stride of the first source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_stride_w                      Stride of the first source tensor in W dimension (in bytes)
+ * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the first source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
+ * @param[in]  dst_step_w                        output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void channel_shuffle_nhwc(TENSOR4D_DECLARATION(src),
+                                   TENSOR4D_DECLARATION(dst))
+{
+    const uint curr_channel = min((uint)(get_global_id(0) * VEC_SIZE), (uint)LAST_ACCESSED); // input feature map
+    uint       channel_id0  = 0;
+    uint       channel_id1  = 0;
+    uint       channel_id2  = 0;
+    uint       channel_id3  = 0;
+    uint       group_id0    = 0;
+    uint       group_id1    = 0;
+    uint       group_id2    = 0;
+    uint       group_id3    = 0;
+    uint       y            = 0;
+    uint       batch_id     = 0;
+
+    // Compute y coordinate and batch_id
+    DIV_MOD_UINT(get_global_id(2), (uint)SRC_DIM_Z, batch_id, y);
+
+    // Compute group_id and channel_id
+    DIV_MOD_UINT(curr_channel + (uint)0, K, group_id0, channel_id0);
+    DIV_MOD_UINT(curr_channel + (uint)1, K, group_id1, channel_id1);
+    DIV_MOD_UINT(curr_channel + (uint)2, K, group_id2, channel_id2);
+    DIV_MOD_UINT(curr_channel + (uint)3, K, group_id3, channel_id3);
+
+    const uint x  = get_global_id(1) * 2;
+    const uint z0 = channel_id0 * (uint)NUM_GROUPS + group_id0;
+    const uint z1 = channel_id1 * (uint)NUM_GROUPS + group_id1;
+    const uint z2 = channel_id2 * (uint)NUM_GROUPS + group_id2;
+    const uint z3 = channel_id3 * (uint)NUM_GROUPS + group_id3;
+
+    // Load the Nx2 block
+    const __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + curr_channel * sizeof(DATA_TYPE) + x * src_stride_y + y * src_stride_z + batch_id * src_stride_w;
+    TYPE u0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_ptr + 0 * src_stride_y));
+    TYPE u1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_ptr + 1 * src_stride_y));
 
     // Store blocks
-    VSTORE(BLOCK_SIZE)
-    (u0, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, x, y + 0, z));
-    VSTORE(BLOCK_SIZE)
-    (u1, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, x, y + 1, z));
-    VSTORE(BLOCK_SIZE)
-    (u2, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, x, y + 2, z));
-    VSTORE(BLOCK_SIZE)
-    (u3, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, x, y + 3, z));
-#if BLOCK_SIZE > 4
-    VSTORE(BLOCK_SIZE)
-    (u4, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, x, y + 4, z));
-    VSTORE(BLOCK_SIZE)
-    (u5, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, x, y + 5, z));
-    VSTORE(BLOCK_SIZE)
-    (u6, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, x, y + 6, z));
-    VSTORE(BLOCK_SIZE)
-    (u7, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, x, y + 7, z));
-#if BLOCK_SIZE == 16
-    VSTORE(BLOCK_SIZE)
-    (u8, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, x, y + 8, z));
-    VSTORE(BLOCK_SIZE)
-    (u9, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, x, y + 9, z));
-    VSTORE(BLOCK_SIZE)
-    (u10, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, x, y + 10, z));
-    VSTORE(BLOCK_SIZE)
-    (u11, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, x, y + 11, z));
-    VSTORE(BLOCK_SIZE)
-    (u12, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, x, y + 12, z));
-    VSTORE(BLOCK_SIZE)
-    (u13, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, x, y + 13, z));
-    VSTORE(BLOCK_SIZE)
-    (u14, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, x, y + 14, z));
-    VSTORE(BLOCK_SIZE)
-    (u15, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, x, y + 15, z));
-#endif /* BLOCK_SIZE == 16 */
-#endif /* BLOCK_SIZE > 4 */
+    __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_stride_y + y * dst_stride_z + batch_id * dst_stride_w;
+    *((__global DATA_TYPE *)(output_ptr + (uint)0 * dst_stride_y + z0 * sizeof(DATA_TYPE))) = u0.s0;
+    *((__global DATA_TYPE *)(output_ptr + (uint)0 * dst_stride_y + z1 * sizeof(DATA_TYPE))) = u0.s1;
+    *((__global DATA_TYPE *)(output_ptr + (uint)0 * dst_stride_y + z2 * sizeof(DATA_TYPE))) = u0.s2;
+    *((__global DATA_TYPE *)(output_ptr + (uint)0 * dst_stride_y + z3 * sizeof(DATA_TYPE))) = u0.s3;
+    *((__global DATA_TYPE *)(output_ptr + (uint)1 * dst_stride_y + z0 * sizeof(DATA_TYPE))) = u1.s0;
+    *((__global DATA_TYPE *)(output_ptr + (uint)1 * dst_stride_y + z1 * sizeof(DATA_TYPE))) = u1.s1;
+    *((__global DATA_TYPE *)(output_ptr + (uint)1 * dst_stride_y + z2 * sizeof(DATA_TYPE))) = u1.s2;
+    *((__global DATA_TYPE *)(output_ptr + (uint)1 * dst_stride_y + z3 * sizeof(DATA_TYPE))) = u1.s3;
 }
-#endif /* defined(DATA_TYPE) && defined(BLOCK_SIZE) && defined(NUM_GROUPS) && defined(K) */
+#endif // VEC_SIZE == 4 && defined(LAST_ACCESSED)
+#endif // defined(DATA_TYPE) && defined(VEC_SIZE) && defined(NUM_GROUPS) && defined(K) && defined(SRC_DIM_Z)
diff --git a/src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp b/src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp
index be4d68770d..53a54564d6 100644
--- a/src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp
+++ b/src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp
@@ -67,18 +67,22 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
     // Output tensor auto initialization if not yet initialized
     auto_init_if_empty(*output, *input->clone());
 
-    const unsigned int num_elems_processed_per_iteration = max_cl_vector_width / input->element_size();
+    const bool             is_nhwc                             = input->data_layout() == DataLayout::NHWC;
+    const unsigned int     num_elems_processed_per_iteration_x = is_nhwc ? 4 : max_cl_vector_width / input->element_size();
+    constexpr unsigned int num_elems_processed_per_iteration_y = 2;
 
     // Configure kernel window
-    Window                win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration, num_elems_processed_per_iteration));
-    AccessWindowRectangle input_access(input, 0, 0, num_elems_processed_per_iteration, num_elems_processed_per_iteration);
-    AccessWindowRectangle output_access(output, 0, 0, num_elems_processed_per_iteration, num_elems_processed_per_iteration);
+    Window                win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+    AccessWindowRectangle input_access(input, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
+    AccessWindowRectangle output_access(output, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
 
     const bool window_changed = update_window_and_padding(win, input_access, output_access);
     output_access.set_valid_region(win, input->valid_region());
 
+    Window win_collapsed = win.collapse(win, Window::DimZ);
+
     Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
-    return std::make_pair(err, win);
+    return std::make_pair(err, win_collapsed);
 }
 } // namespace
@@ -96,14 +100,19 @@ void CLChannelShuffleLayerKernel::configure(const ICLTensor *input, ICLTensor *o
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), num_groups));
 
-    const unsigned int channels   = input->info()->dimension(get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::CHANNEL));
-    const unsigned int block_size = max_cl_vector_width / input->info()->element_size();
+    const DataLayout   data_layout = input->info()->data_layout();
+    const bool         is_nhwc     = data_layout == DataLayout::NHWC;
+    const unsigned int channels    = input->info()->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL));
+    const unsigned int vec_size    = is_nhwc ? 4 : max_cl_vector_width / input->info()->element_size();
 
     // Set kernel build options
     CLBuildOptions build_opts;
     build_opts.add_option("-DNUM_GROUPS=" + support::cpp11::to_string(num_groups));
     build_opts.add_option("-DK=" + support::cpp11::to_string(channels / num_groups));
-    build_opts.add_option("-DBLOCK_SIZE=" + support::cpp11::to_string(block_size));
+    build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size));
+    build_opts.add_option("-DSRC_DIM_Z=" + support::cpp11::to_string(input->info()->dimension(2)));
+    build_opts.add_option("-DLAST_ACCESSED=" + support::cpp11::to_string(std::max(static_cast<int>(channels - vec_size), 0)));
+
     switch(input->info()->element_size())
     {
         case 1:
@@ -120,12 +129,33 @@ void CLChannelShuffleLayerKernel::configure(const ICLTensor *input, ICLTensor *o
     }
 
     // Create kernel
-    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("channel_shuffle_nchw", build_opts.options()));
+    std::string kernel_name = "channel_shuffle_" + lower_string(string_from_data_layout(data_layout));
+    _kernel                 = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
 
     // Configure kernel window
     auto win_config = validate_and_configure_window(input->info(), output->info());
     ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
     ICLKernel::configure_internal(win_config.second);
+
+    // Set config_id for enabling LWS tuning
+    _config_id = kernel_name;
+    _config_id += "_";
+    _config_id += lower_string(string_from_data_type(input->info()->data_type()));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(num_groups);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(input->info()->dimension(0));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(input->info()->dimension(1));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(input->info()->dimension(2));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(output->info()->dimension(0));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(output->info()->dimension(1));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(output->info()->dimension(2));
 }
 
 Status CLChannelShuffleLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int num_groups)
@@ -141,14 +171,9 @@ void CLChannelShuffleLayerKernel::run(const Window &window, cl::CommandQueue &qu
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
 
-    Window slice = window.first_slice_window_3D();
-    do
-    {
-        unsigned int idx = 0;
-        add_3D_tensor_argument(idx, _input, slice);
-        add_3D_tensor_argument(idx, _output, slice);
-        enqueue(queue, *this, slice);
-    }
-    while(window.slide_window_slice_3D(slice));
+    unsigned int idx = 0;
+    add_4D_tensor_argument(idx, _input, window);
+    add_4D_tensor_argument(idx, _output, window);
+    enqueue(queue, *this, window, lws_hint());
 }
 } // namespace arm_compute
--
cgit v1.2.1
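
The index arithmetic shared by both kernels is the ShuffleNet channel permutation: an input channel c, decomposed as group_id = c / K and channel_id = c % K (with K = num_channels / num_groups), is written to output channel channel_id * NUM_GROUPS + group_id. Below is a minimal host-side C++ sketch of the same permutation, useful as a reference when reviewing the kernels; the function name and shapes are illustrative only and not part of the library.

#include <cassert>
#include <cstdio>
#include <vector>

// Reference channel shuffle for one NCHW image: channel c = group_id * K + channel_id
// moves to output channel channel_id * num_groups + group_id.
std::vector<float> channel_shuffle(const std::vector<float> &src, int channels, int hw, int num_groups)
{
    assert(channels % num_groups == 0);
    const int          K = channels / num_groups;
    std::vector<float> dst(src.size());
    for(int c = 0; c < channels; ++c)
    {
        const int group_id   = c / K;
        const int channel_id = c % K;
        const int z          = channel_id * num_groups + group_id; // destination channel
        for(int i = 0; i < hw; ++i)
        {
            dst[z * hw + i] = src[c * hw + i];
        }
    }
    return dst;
}

int main()
{
    // 4 channels, 1x1 spatial, 2 groups: channels (0,1,2,3) map to slots (0,2,1,3)
    const std::vector<float> out = channel_shuffle({ 0.f, 1.f, 2.f, 3.f }, 4, 1, 2);
    for(float v : out)
    {
        std::printf("%g ", v); // prints: 0 2 1 3
    }
    std::printf("\n");
    return 0;
}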
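
The DIV_MOD_UINT macro trades the integer divide/modulo pair for a single float multiply by a precomputed reciprocal, which is typically cheaper on GPU cores. The result matches true integer division only while the operands are small enough that float rounding cannot push the truncated quotient across an integer boundary; the batch/channel indices used by these kernels stay in that range. A C++ checker for the trick over a chosen index range follows (the helper name is ours, not the library's):

#include <cstdint>
#include <cstdio>

// Float-reciprocal division and modulo, mirroring the DIV_MOD_UINT macro in the kernel.
static void div_mod_uint(uint32_t x, uint32_t y, uint32_t &div_res, uint32_t &mod_res)
{
    div_res = static_cast<uint32_t>(x * (1.0f / static_cast<float>(y)));
    mod_res = x - div_res * y;
}

int main()
{
    // Compare against the exact operators over a small index range and report any mismatch.
    for(uint32_t y = 1; y <= 64; ++y)
    {
        for(uint32_t x = 0; x < 4096; ++x)
        {
            uint32_t d = 0, m = 0;
            div_mod_uint(x, y, d, m);
            if(d != x / y || m != x % y)
            {
                std::printf("mismatch at x=%u y=%u\n", x, y);
                return 1;
            }
        }
    }
    std::printf("no mismatches in the tested range\n");
    return 0;
}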
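
The min(..., LAST_ACCESSED) clamp in the NHWC kernel, with LAST_ACCESSED = max(channels - vec_size, 0) set at configure time, is the usual trick for vectorized kernels whose extent is not a multiple of VEC_SIZE: the last work-item slides its starting index back so that a fixed-width access stays in bounds, re-processing a few overlapping elements instead of reading or writing past the end. A small C++ sketch of the clamping rule, with illustrative names:

#include <algorithm>
#include <cstdio>

// Clamped vector start: the final work-item shifts back so a fixed-width (vec_size)
// access never runs past `len`; overlapping elements are simply processed twice.
int clamped_start(int work_item_id, int vec_size, int len)
{
    const int last_accessed = std::max(len - vec_size, 0);
    return std::min(work_item_id * vec_size, last_accessed);
}

int main()
{
    // len = 10, vec_size = 4: starts are 0, 4, 6 (not 8), so every access stays in [0, 10)
    for(int id = 0; id < 3; ++id)
    {
        std::printf("work-item %d starts at %d\n", id, clamped_start(id, 4, 10));
    }
    return 0;
}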