From e57e11d66b8f1e49eff180bc65f877bf88c4c4cf Mon Sep 17 00:00:00 2001 From: Manuel Bottini Date: Tue, 11 May 2021 17:36:14 +0100 Subject: Remove padding from CLChannelShuffleLayerKernel Only for NHWC Instead of starting from the input vector and insert the values in the output, we now create an output vector by taking from the input tensor. This makes it easier to store the result in the output with border awareness Resolves: COMPMID-4445 Change-Id: I77cf2d2d5ff30a383cddb8a3c81c462d7a39fd2e Signed-off-by: Manuel Bottini Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5627 Comments-Addressed: Arm Jenkins Tested-by: Arm Jenkins Reviewed-by: Georgios Pinitas Reviewed-by: Michele Di Giorgio --- src/core/CL/cl_kernels/channel_shuffle.cl | 147 +++++++++++++-------- .../CL/kernels/CLChannelShuffleLayerKernel.cpp | 63 ++++++--- 2 files changed, 138 insertions(+), 72 deletions(-) diff --git a/src/core/CL/cl_kernels/channel_shuffle.cl b/src/core/CL/cl_kernels/channel_shuffle.cl index 9a87eb4af3..b7272da33a 100644 --- a/src/core/CL/cl_kernels/channel_shuffle.cl +++ b/src/core/CL/cl_kernels/channel_shuffle.cl @@ -1,5 +1,5 @@ /* -* Copyright (c) 2018-2020 Arm Limited. +* Copyright (c) 2018-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -22,15 +22,14 @@ * SOFTWARE. 
*/ #include "helpers.h" +#include "tile_helpers.h" #if defined(DATA_TYPE) && defined(VEC_SIZE) && defined(NUM_GROUPS) && defined(K) && defined(SRC_DIM_Z) // Check valid VEC_SIZES -#if VEC_SIZE != 4 && VEC_SIZE != 8 && VEC_SIZE != 16 -#error "Only vector sizes 4, 8 and 16 are supported" -#endif // VEC_SIZE != 4 && VEC_SIZE != 8 && VEC_SIZE != 16 - -#define TYPE VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) +#if VEC_SIZE != 1 && VEC_SIZE != 2 && VEC_SIZE != 3 && VEC_SIZE != 4 && VEC_SIZE != 8 && VEC_SIZE != 16 +#error "Only vector sizes 1, 2, 3, 4, 8 and 16 are supported" +#endif // VEC_SIZE != 1 && VEC_SIZE != 2 && VEC_SIZE != 3 && VEC_SIZE != 4 && VEC_SIZE != 8 && VEC_SIZE != 16 #define DIV_MOD_UINT(x, y, div_res, mod_res) \ ({ \ @@ -88,8 +87,10 @@ __kernel void channel_shuffle_nchw(TENSOR4D_DECLARATION(src), // Load the Nx2 block const __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * src_stride_y + curr_channel * src_stride_z + batch_id * src_stride_w; - TYPE u0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_ptr + 0 * src_stride_y)); - TYPE u1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_ptr + 1 * src_stride_y)); + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + u0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_ptr + 0 * src_stride_y)); + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + u1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_ptr + 1 * src_stride_y)); // Store blocks __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * dst_stride_y + z * dst_stride_z + batch_id * dst_stride_w; @@ -99,16 +100,17 @@ __kernel void channel_shuffle_nchw(TENSOR4D_DECLARATION(src), (u1, 0, (__global DATA_TYPE *)(output_ptr + 1 * dst_stride_y)); } -#if VEC_SIZE == 4 && defined(LAST_ACCESSED) +#if defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) && defined(SRC_DIM_X) + /** Performs channel shuffle when the data layout is NHWC. See https://arxiv.org/pdf/1707.01083.pdf for details. 
* - * @note This implementation is only defined for VEC_SIZE = 4 - * @note This last element accessed along the first dimension must be given as a preprocessor argument using -DLAST_ACCESSED=num. e.g. -DLAST_ACCESSED=64 in order to prevent out-of-bound writes. * @note The vector size must be given as a preprocessor argument using -DVEC_SIZE=num. e.g. -DVEC_SIZE=4 - * @note The height of the tensor must be given as a preprocessor argument using -DSRC_DIM_Z=num. e.g. -DSRC_DIM_Z=64 + * @note The third dimension of the tensor must be given as a preprocessor argument using -DSRC_DIM_Z=num. e.g. -DSRC_DIM_Z=64 + * @note The first dimension of the tensor must be given as a preprocessor argument using -DSRC_DIM_X=num. e.g. -DSRC_DIM_X=64 * @note The number of groups must be given as a preprocessor argument using -DNUM_GROUPS=num_groups. e.g. -DNUM_GROUPS=2 * @note The number of channels in each group must be given as a preprocessor argument using -DK=num. e.g. -DK=1 * K is equal to num_channels / num_groups. + * @note The leftover size in the X dimension should be given as a preprocessor argument using -DVEC_SIZE_LEFTOVER, i.e. x_dimension % VEC_SIZE. e.g. -DVEC_SIZE_LEFTOVER=1 * * @param[in] src_ptr Pointer to the source matrix. 
Supported data types: All * @param[in] src_stride_x Stride of the first source tensor in X dimension (in bytes) @@ -134,48 +136,87 @@ __kernel void channel_shuffle_nchw(TENSOR4D_DECLARATION(src), __kernel void channel_shuffle_nhwc(TENSOR4D_DECLARATION(src), TENSOR4D_DECLARATION(dst)) { - const uint curr_channel = min((uint)(get_global_id(0) * VEC_SIZE), (uint)LAST_ACCESSED); // input feature map - uint channel_id0 = 0; - uint channel_id1 = 0; - uint channel_id2 = 0; - uint channel_id3 = 0; - uint group_id0 = 0; - uint group_id1 = 0; - uint group_id2 = 0; - uint group_id3 = 0; - uint y = 0; - uint batch_id = 0; + // Offset computation + const uint curr_out_channel = GET_SPATIAL_IDX(0, VEC_SIZE, VEC_SIZE_LEFTOVER); // output feature map + uint z = 0; + uint batch_id = 0; // Compute curr_channel and batch_id - DIV_MOD_UINT(get_global_id(2), (uint)SRC_DIM_Z, batch_id, y); - - // Compute group_id and channel_id - DIV_MOD_UINT(curr_channel + (uint)0, K, group_id0, channel_id0); - DIV_MOD_UINT(curr_channel + (uint)1, K, group_id1, channel_id1); - DIV_MOD_UINT(curr_channel + (uint)2, K, group_id2, channel_id2); - DIV_MOD_UINT(curr_channel + (uint)3, K, group_id3, channel_id3); - - const uint x = get_global_id(1) * 2; - const uint z0 = channel_id0 * (uint)NUM_GROUPS + group_id0; - const uint z1 = channel_id1 * (uint)NUM_GROUPS + group_id1; - const uint z2 = channel_id2 * (uint)NUM_GROUPS + group_id2; - const uint z3 = channel_id3 * (uint)NUM_GROUPS + group_id3; - - // Load the Nx2 block - const __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + curr_channel * sizeof(DATA_TYPE) + x * src_stride_y + y * src_stride_z + batch_id * src_stride_w; - TYPE u0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_ptr + 0 * src_stride_y)); - TYPE u1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_ptr + 1 * src_stride_y)); - - // Store blocks - __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_stride_y + y * dst_stride_z + batch_id 
* dst_stride_w; - *((__global DATA_TYPE *)(output_ptr + (uint)0 * dst_stride_y + z0 * sizeof(DATA_TYPE))) = u0.s0; - *((__global DATA_TYPE *)(output_ptr + (uint)0 * dst_stride_y + z1 * sizeof(DATA_TYPE))) = u0.s1; - *((__global DATA_TYPE *)(output_ptr + (uint)0 * dst_stride_y + z2 * sizeof(DATA_TYPE))) = u0.s2; - *((__global DATA_TYPE *)(output_ptr + (uint)0 * dst_stride_y + z3 * sizeof(DATA_TYPE))) = u0.s3; - *((__global DATA_TYPE *)(output_ptr + (uint)1 * dst_stride_y + z0 * sizeof(DATA_TYPE))) = u1.s0; - *((__global DATA_TYPE *)(output_ptr + (uint)1 * dst_stride_y + z1 * sizeof(DATA_TYPE))) = u1.s1; - *((__global DATA_TYPE *)(output_ptr + (uint)1 * dst_stride_y + z2 * sizeof(DATA_TYPE))) = u1.s2; - *((__global DATA_TYPE *)(output_ptr + (uint)1 * dst_stride_y + z3 * sizeof(DATA_TYPE))) = u1.s3; + DIV_MOD_UINT(get_global_id(2), (uint)SRC_DIM_Z, batch_id, z); + + VEC_DATA_TYPE(uint, VEC_SIZE) + curr_out_channels = (VEC_DATA_TYPE(uint, VEC_SIZE))(curr_out_channel) + VEC_OFFS(uint, VEC_SIZE); + + VEC_DATA_TYPE(uint, VEC_SIZE) + in_channels = (curr_out_channels * (VEC_DATA_TYPE(uint, VEC_SIZE))(K)) % (VEC_DATA_TYPE(uint, VEC_SIZE))(SRC_DIM_X) + (curr_out_channels / (VEC_DATA_TYPE(uint, VEC_SIZE))(NUM_GROUPS)); + + // Load the values + const __global DATA_TYPE *input_ptr = (const __global DATA_TYPE *)(src_ptr + src_offset_first_element_in_bytes + get_global_id(1) * src_stride_y + z * src_stride_z + batch_id * src_stride_w); + +#if VEC_SIZE == 1 + DATA_TYPE out0 = *((const __global DATA_TYPE *)(input_ptr) + in_channels); +#elif VEC_SIZE == 2 + VEC_DATA_TYPE(DATA_TYPE, 2) + out0 = + { + *(input_ptr + in_channels.s0), + *(input_ptr + in_channels.s1) + }; +#elif VEC_SIZE == 3 + VEC_DATA_TYPE(DATA_TYPE, 3) + out0 = + { + *(input_ptr + in_channels.s0), + *(input_ptr + in_channels.s1), + *(input_ptr + in_channels.s2) + }; +#elif VEC_SIZE == 4 + VEC_DATA_TYPE(DATA_TYPE, 4) + out0 = + { + *(input_ptr + in_channels.s0), + *(input_ptr + in_channels.s1), + *(input_ptr + 
in_channels.s2), + *(input_ptr + in_channels.s3) + }; +#elif VEC_SIZE == 8 + VEC_DATA_TYPE(DATA_TYPE, 8) + out0 = + { + *(input_ptr + in_channels.s0), + *(input_ptr + in_channels.s1), + *(input_ptr + in_channels.s2), + *(input_ptr + in_channels.s3), + *(input_ptr + in_channels.s4), + *(input_ptr + in_channels.s5), + *(input_ptr + in_channels.s6), + *(input_ptr + in_channels.s7) + }; +#elif VEC_SIZE == 16 + VEC_DATA_TYPE(DATA_TYPE, 16) + out0 = + { + *(input_ptr + in_channels.s0), + *(input_ptr + in_channels.s1), + *(input_ptr + in_channels.s2), + *(input_ptr + in_channels.s3), + *(input_ptr + in_channels.s4), + *(input_ptr + in_channels.s5), + *(input_ptr + in_channels.s6), + *(input_ptr + in_channels.s7), + *(input_ptr + in_channels.s8), + *(input_ptr + in_channels.s9), + *(input_ptr + in_channels.sa), + *(input_ptr + in_channels.sb), + *(input_ptr + in_channels.sc), + *(input_ptr + in_channels.sd), + *(input_ptr + in_channels.se), + *(input_ptr + in_channels.sf) + }; +#endif // VEC_SIZE == 1 + + __global uchar *output_ptr = dst_ptr + curr_out_channel * sizeof(DATA_TYPE) + dst_offset_first_element_in_bytes + get_global_id(1) * dst_stride_y + z * dst_stride_z + batch_id * dst_stride_w; + STORE_VECTOR_SELECT(out, DATA_TYPE, output_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0); } -#endif // VEC_SIZE == 4 && defined(LAST_ACCESSED) +#endif // defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) && defined(SRC_DIM_X) #endif // defined(DATA_TYPE) && defined(VEC_SIZE) && defined(NUM_GROUPS) && defined(K) && defined(SRC_DIM_Z) diff --git a/src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp b/src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp index c774e98e44..8a6b58002c 100644 --- a/src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp +++ b/src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp @@ -66,21 +66,31 @@ std::pair validate_and_configure_window(ITensorInfo *input, ITen // Output tensor auto initialization if not yet initialized 
auto_init_if_empty(*output, *input->clone()); - const bool is_nhwc = input->data_layout() == DataLayout::NHWC; - const unsigned int num_elems_processed_per_iteration_x = is_nhwc ? 4 : max_cl_vector_width / input->element_size(); - constexpr unsigned int num_elems_processed_per_iteration_y = 2; + const bool is_nhwc = input->data_layout() == DataLayout::NHWC; + if(is_nhwc) + { + unsigned int num_elems_processed_per_iteration_x = adjust_vec_size(max_cl_vector_width / input->element_size(), input->dimension(0)); + Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration_x)); + Window win_collapsed = win.collapse(win, Window::DimZ); + return std::make_pair(Status{}, win_collapsed); + } + else + { + const unsigned int num_elems_processed_per_iteration_x = max_cl_vector_width / input->element_size(); + constexpr unsigned int num_elems_processed_per_iteration_y = 2; - // Configure kernel window - Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); - AccessWindowRectangle input_access(input, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y); - AccessWindowRectangle output_access(output, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y); + // Configure kernel window + Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); + AccessWindowRectangle input_access(input, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y); + AccessWindowRectangle output_access(output, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y); - const bool window_changed = update_window_and_padding(win, input_access, output_access); + const bool window_changed = update_window_and_padding(win, input_access, output_access); - Window win_collapsed = win.collapse(win, Window::DimZ); + Window win_collapsed = win.collapse(win, 
Window::DimZ); - Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; - return std::make_pair(err, win_collapsed); + Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; + return std::make_pair(err, win_collapsed); + } } } // namespace @@ -97,24 +107,35 @@ void CLChannelShuffleLayerKernel::configure(const ICLTensor *input, ICLTensor *o void CLChannelShuffleLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, unsigned int num_groups) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), num_groups)); + auto padding_info = get_padding_info({ input, output }); _input = input; _output = output; - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), num_groups)); - - const DataLayout data_layout = input->info()->data_layout(); - const bool is_nhwc = data_layout == DataLayout::NHWC; - const unsigned int channels = input->info()->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL)); - const unsigned int vec_size = is_nhwc ? 
4 : max_cl_vector_width / input->info()->element_size(); + const DataLayout data_layout = input->info()->data_layout(); + const bool is_nhwc = data_layout == DataLayout::NHWC; + const unsigned int channels = input->info()->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL)); + unsigned int vec_size_x = 0; + unsigned int vec_size_x_leftovers = 0; + if(is_nhwc) + { + vec_size_x = adjust_vec_size(max_cl_vector_width / input->info()->element_size(), input->info()->dimension(0)); + vec_size_x_leftovers = input->info()->dimension(0) % vec_size_x; + } + else + { + vec_size_x = max_cl_vector_width / input->info()->element_size(); + } // Set kernel build options CLBuildOptions build_opts; build_opts.add_option("-DNUM_GROUPS=" + support::cpp11::to_string(num_groups)); build_opts.add_option("-DK=" + support::cpp11::to_string(channels / num_groups)); - build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size)); + build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x)); + build_opts.add_option_if(is_nhwc, "-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(vec_size_x_leftovers)); + build_opts.add_option_if(is_nhwc, "-DSRC_DIM_X=" + support::cpp11::to_string(input->info()->dimension(0))); build_opts.add_option("-DSRC_DIM_Z=" + support::cpp11::to_string(input->info()->dimension(2))); - build_opts.add_option("-DLAST_ACCESSED=" + support::cpp11::to_string(std::max(static_cast(channels - vec_size), 0))); build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(input->info()->element_size())); // Create kernel @@ -145,6 +166,10 @@ void CLChannelShuffleLayerKernel::configure(const CLCompileContext &compile_cont _config_id += support::cpp11::to_string(output->info()->dimension(1)); _config_id += "_"; _config_id += support::cpp11::to_string(output->info()->dimension(2)); + if(data_layout == DataLayout::NHWC) + { + ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); + } } Status 
CLChannelShuffleLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int num_groups) -- cgit v1.2.1