From 4112eed70d110376674609af92e76c68ae8b3a39 Mon Sep 17 00:00:00 2001
From: Giorgio Arena
Date: Fri, 23 Oct 2020 14:24:26 +0100
Subject: COMPMID-3731 Remove OpenCL padding: CLHeightConcatenateLayerKernel

Signed-off-by: Giorgio Arena
Change-Id: I004128fdcc1207c25d2b959f17f04f9e1a8b4cb5
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/4247
Tested-by: Arm Jenkins
Reviewed-by: Michele Di Giorgio
---
 .../CL/kernels/CLHeightConcatenateLayerKernel.h    |  1 -
 arm_compute/core/Utils.h                           | 18 +++++++----
 src/core/CL/cl_kernels/concatenate.cl              | 23 +++++++-------
 .../CL/kernels/CLHeightConcatenateLayerKernel.cpp  | 35 ++++++++--------------
 src/core/Utils.cpp                                 | 27 +++++++++++++----
 5 files changed, 59 insertions(+), 45 deletions(-)

diff --git a/arm_compute/core/CL/kernels/CLHeightConcatenateLayerKernel.h b/arm_compute/core/CL/kernels/CLHeightConcatenateLayerKernel.h
index 4fa2b40881..f362441944 100644
--- a/arm_compute/core/CL/kernels/CLHeightConcatenateLayerKernel.h
+++ b/arm_compute/core/CL/kernels/CLHeightConcatenateLayerKernel.h
@@ -72,7 +72,6 @@ public:
 
 private:
     unsigned int _height_offset;
-    unsigned int _num_elems_processed_per_iteration;
 };
 } // namespace arm_compute
 #endif /* ARM_COMPUTE_CLHEIGHTCONCATENATELAYERKERNEL_H */
diff --git a/arm_compute/core/Utils.h b/arm_compute/core/Utils.h
index 681a1a708e..1c02e89ab6 100644
--- a/arm_compute/core/Utils.h
+++ b/arm_compute/core/Utils.h
@@ -45,6 +45,7 @@ namespace arm_compute
 {
 class ITensor;
+class ITensorInfo;
 
 /** Calculate the rounded up quotient of val / m.
  *
@@ -1094,20 +1095,27 @@
  * @return DataType
  */
 DataType data_type_from_name(const std::string &name);
+/** Stores padding information before configuring a kernel
+ *
+ * @param[in] infos list of tensor infos to store the padding info for
+ *
+ * @return An unordered map where each tensor info pointer is paired with its original padding info
+ */
+std::unordered_map<const ITensorInfo *, PaddingSize> get_padding_info(std::initializer_list<const ITensorInfo *> infos);
 /** Stores padding information before configuring a kernel
  *
  * @param[in] tensors list of tensors to store the padding info for
  *
- * @return An unordered map where each tensor pointer is paired with its original padding info
+ * @return An unordered map where each tensor info pointer is paired with its original padding info
  */
-std::unordered_map<const ITensor *, PaddingSize> get_padding_info(std::initializer_list<const ITensor *> tensors);
+std::unordered_map<const ITensorInfo *, PaddingSize> get_padding_info(std::initializer_list<const ITensor *> tensors);
 /** Check if the previously stored padding info has changed after configuring a kernel
  *
- * @param[in] padding_map an unordered map where each tensor pointer is paired with its original padding info
+ * @param[in] padding_map an unordered map where each tensor info pointer is paired with its original padding info
  *
- * @return true if any of the tensors has changed its paddings
+ * @return true if any of the tensor infos has changed its paddings
  */
-bool has_padding_changed(const std::unordered_map<const ITensor *, PaddingSize> &padding_map);
+bool has_padding_changed(const std::unordered_map<const ITensorInfo *, PaddingSize> &padding_map);
 
 /** Input Stream operator for @ref DataType
  *
diff --git a/src/core/CL/cl_kernels/concatenate.cl b/src/core/CL/cl_kernels/concatenate.cl
index 0b211a6d1f..0f4b5afe2c 100644
--- a/src/core/CL/cl_kernels/concatenate.cl
+++ b/src/core/CL/cl_kernels/concatenate.cl
@@ -330,6 +330,8 @@ __kernel void concatenate_width(
 
 #endif /* defined(WIDTH_OFFSET) && defined(DEPTH) */
 
+#if defined(VEC_SIZE_LEFTOVER)
+
 #if defined(HEIGHT_OFFSET) && defined(DEPTH) && defined(VEC_SIZE)
 /** This kernel concatenates the input tensor into the output tensor along the second dimension
  *
@@ -338,6 +340,7 @@ __kernel void concatenate_width(
  * @note Vector sizes supported are 2,4,8 and 16.
  * @note The offset for the second spatial dimension has to be passed at compile time using -DHEIGHT_OFFSET. i.e. -DHEIGHT_OFFSET=128
  * @note Tensor depth should be given as a preprocessor argument using -DDEPTH=size. e.g. -DDEPTH=16
+ * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE
  *
  * @param[in] src_ptr      Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/F32
  * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
@@ -365,26 +368,26 @@ __kernel void concatenate_height(
     TENSOR4D_DECLARATION(src),
     TENSOR4D_DECLARATION(dst))
 {
-    Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, DEPTH);
-    Tensor4D dst = CONVERT_TO_TENSOR4D_STRUCT(dst, DEPTH);
+    const int x_offs = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0) * sizeof(DATA_TYPE);
+
+    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x_offs + get_global_id(1) * src_stride_y + (get_global_id(2) % DEPTH) * src_stride_z + (get_global_id(
+                                   2) / DEPTH) * src_stride_w;
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offs + get_global_id(1) * dst_stride_y + (get_global_id(2) % DEPTH) * dst_stride_z + (get_global_id(
+                                   2) / DEPTH) * dst_stride_w;
 
     VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
-    source_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src.ptr);
+    source_values0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src_addr);
 
 #if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT)
-    const VEC_QUANT out = requantize(source_values, OFFSET_IN1, OFFSET_OUT, SCALE_IN1, SCALE_OUT);
-    VSTORE(VEC_SIZE)
-    (out, 0, (__global DATA_TYPE *)(dst.ptr + HEIGHT_OFFSET * dst_stride_y));
+    const VEC_QUANT out0 = requantize(source_values0, OFFSET_IN1, OFFSET_OUT, SCALE_IN1, SCALE_OUT);
+    STORE_VECTOR_SELECT(out, DATA_TYPE, dst_addr + HEIGHT_OFFSET * dst_stride_y, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
 #else /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */
-    VSTORE(VEC_SIZE)
-    (source_values, 0, (__global DATA_TYPE *)(dst.ptr + HEIGHT_OFFSET * dst_stride_y));
+    STORE_VECTOR_SELECT(source_values, DATA_TYPE, dst_addr + HEIGHT_OFFSET * dst_stride_y, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
 #endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */
 }
 
 #endif /* defined(HEIGHT_OFFSET) && defined(DEPTH) */
 
-#if defined(VEC_SIZE_LEFTOVER)
-
 /** This kernel concatenates the input tensor into the output tensor along the third dimension
  *
  * @note The data type has to be passed at compile time using -DDATA_TYPE. i.e. -DDATA_TYPE=float
diff --git a/src/core/CL/kernels/CLHeightConcatenateLayerKernel.cpp b/src/core/CL/kernels/CLHeightConcatenateLayerKernel.cpp
index 1ae2599721..3f5e91e5a1 100644
--- a/src/core/CL/kernels/CLHeightConcatenateLayerKernel.cpp
+++ b/src/core/CL/kernels/CLHeightConcatenateLayerKernel.cpp
@@ -39,20 +39,6 @@ namespace arm_compute
 {
 namespace
 {
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, unsigned int &num_elems_processed_per_iteration)
-{
-    num_elems_processed_per_iteration = 4;
-    // The window needs to be based on input as we copy all the heights of input
-    Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
-    AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
-    AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
-    bool window_changed = update_window_and_padding(win, input_access, output_access);
-
-    Window win_collapsed = win.collapse(win, Window::DimZ);
-
-    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
-    return std::make_pair(err, win_collapsed);
-}
 Status validate_arguments(const ITensorInfo *input, unsigned int height_offset, const ITensorInfo *output)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
@@ -72,15 +58,13 @@ Status validate_arguments(const ITensorInfo *input, unsigned int height_offset,
 } // namespace
 
 CLHeightConcatenateLayerKernel::CLHeightConcatenateLayerKernel()
-    : _height_offset(0), _num_elems_processed_per_iteration()
+    : _height_offset(0)
 {
 }
 
 Status CLHeightConcatenateLayerKernel::validate(const ITensorInfo *input, unsigned int height_offset, const ITensorInfo *output)
 {
-    unsigned int num_elems_processed_per_iteration;
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, height_offset, output));
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), num_elems_processed_per_iteration).first);
     return Status{};
 }
 
@@ -89,16 +73,19 @@ void CLHeightConcatenateLayerKernel::configure(const CLCompileContext &compile_c
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input, height_offset, output));
 
-    _height_offset = height_offset;
+    auto padding_info = get_padding_info({ input, output });
 
-    auto win_config = validate_and_configure_window(input, output, _num_elems_processed_per_iteration);
+    _height_offset = height_offset;
 
     // Add build options
+    const unsigned int num_elems_processed_per_iteration = adjust_vec_size(4, input->dimension(0));
+
     CLBuildOptions build_opts;
     build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(input->element_size()));
-    build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(_num_elems_processed_per_iteration));
+    build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
     build_opts.add_option("-DHEIGHT_OFFSET=" + support::cpp11::to_string(_height_offset));
     build_opts.add_option("-DDEPTH=" + support::cpp11::to_string(input->dimension(2)));
+    build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(input->dimension(0) % num_elems_processed_per_iteration));
 
     if(is_data_type_quantized_asymmetric(input->data_type()) && input->quantization_info() != output->quantization_info())
     {
@@ -115,12 +102,14 @@ void CLHeightConcatenateLayerKernel::configure(const CLCompileContext &compile_c
     _kernel = create_kernel(compile_context, "concatenate_height", build_opts.options());
 
     // Configure kernel window
-    ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
-
-    ICLKernel::configure_internal(std::get<1>(win_config));
+    // The window needs to be based on input as we copy all the heights of input
+    Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
+    ICLKernel::configure_internal(win.collapse(win, Window::DimZ));
 
     // Set output valid region
     output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape()));
+
+    ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
 }
 
 void CLHeightConcatenateLayerKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
diff --git a/src/core/Utils.cpp b/src/core/Utils.cpp
index c877e8fd1f..babf1c4b91 100644
--- a/src/core/Utils.cpp
+++ b/src/core/Utils.cpp
@@ -495,26 +495,41 @@ std::pair<int32_t, int32_t> get_quantized_activation_min_max(ActivationLayerInfo
     return std::make_pair(min_activation, max_activation);
 }
 
-std::unordered_map<const ITensor *, PaddingSize> get_padding_info(std::initializer_list<const ITensor *> tensors)
+std::unordered_map<const ITensorInfo *, PaddingSize> get_padding_info(std::initializer_list<const ITensor *> tensors)
 {
-    std::unordered_map<const ITensor *, PaddingSize> res;
+    std::unordered_map<const ITensorInfo *, PaddingSize> res;
 
     for(const ITensor *tensor : tensors)
     {
         if(tensor)
         {
-            res.insert({ tensor, tensor->info()->padding() });
+            res.insert({ tensor->info(), tensor->info()->padding() });
         }
     }
 
     return res;
 }
 
-bool has_padding_changed(const std::unordered_map<const ITensor *, PaddingSize> &padding_map)
+std::unordered_map<const ITensorInfo *, PaddingSize> get_padding_info(std::initializer_list<const ITensorInfo *> infos)
 {
-    return std::find_if(padding_map.begin(), padding_map.end(), [](const std::pair<const ITensor *, PaddingSize> &padding_info)
+    std::unordered_map<const ITensorInfo *, PaddingSize> res;
+
+    for(const ITensorInfo *info : infos)
+    {
+        if(info)
+        {
+            res.insert({ info, info->padding() });
+        }
+    }
+
+    return res;
+}
+
+bool has_padding_changed(const std::unordered_map<const ITensorInfo *, PaddingSize> &padding_map)
+{
+    return std::find_if(padding_map.begin(), padding_map.end(), [](const std::pair<const ITensorInfo *, PaddingSize> &padding_info)
     {
-        return (padding_info.first->info()->padding() != padding_info.second);
+        return (padding_info.first->padding() != padding_info.second);
     }) != padding_map.end();
 }
--
cgit v1.2.1
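
The indexing scheme this patch relies on can be checked in isolation: VEC_SIZE_LEFTOVER is the remainder of the row width divided by VEC_SIZE, work-item 0 stores only those leftover elements, and every other work-item is shifted back by (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE so it can store a full vector without running past the end of the row. The standalone C++ sketch below is not part of the patch; the helper names are invented for illustration and it only mirrors the x_offs and STORE_VECTOR_SELECT arithmetic, verifying that every element of a row is written exactly once with no out-of-bounds store.

// Standalone sketch (not part of the patch): models the kernel's per-work-item
// indexing for a single row of `width` elements processed VEC_SIZE at a time.
#include <algorithm>
#include <cassert>
#include <cstdio>
#include <vector>

// Mirrors x_offs (in elements rather than bytes): every work-item except the
// first is shifted back by (vec_size - leftover) % vec_size so that its full
// vector access stays inside the row.
int start_element(int gid, int vec_size, int leftover)
{
    return std::max(gid * vec_size - (vec_size - leftover) % vec_size, 0);
}

// Mirrors the STORE_VECTOR_SELECT condition: when there is a leftover,
// work-item 0 stores only `leftover` elements, everyone else stores a full vector.
int stored_elements(int gid, int vec_size, int leftover)
{
    return (leftover != 0 && gid == 0) ? leftover : vec_size;
}

int main()
{
    const int width      = 10;               // first dimension of the tensor
    const int vec_size   = 4;                // what adjust_vec_size(4, width) keeps when width >= 4
    const int leftover   = width % vec_size; // value passed as -DVEC_SIZE_LEFTOVER
    const int work_items = (width + vec_size - 1) / vec_size;

    std::vector<int> writes(width, 0);
    for(int gid = 0; gid < work_items; ++gid)
    {
        const int start = start_element(gid, vec_size, leftover);
        const int count = stored_elements(gid, vec_size, leftover);
        for(int x = start; x < start + count; ++x)
        {
            assert(x < width); // no store lands past the row, so no output padding is required
            ++writes[x];
        }
    }
    for(int x = 0; x < width; ++x)
    {
        assert(writes[x] == 1); // every element is written exactly once
    }
    std::printf("width=%d vec_size=%d leftover=%d: covered exactly once\n", width, vec_size, leftover);
    return 0;
}

Loads use the same offsets, so as long as VEC_SIZE does not exceed the row width, which is what adjust_vec_size(4, input->dimension(0)) is there to ensure, the full-width VLOAD of work-item 0 also stays in bounds.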