From 0c58f99495844c6ace629116451dae00e8c27418 Mon Sep 17 00:00:00 2001
From: Manuel Bottini <manuel.bottini@arm.com>
Date: Thu, 3 Dec 2020 16:26:35 +0000
Subject: Remove OpenCL padding CLScaleKernel

Resolves COMPMID-3918

Change-Id: I970b1eaf2ae6f2f5a8cfc318cd1a3dfd3ba36fdb
Signed-off-by: Manuel Bottini <manuel.bottini@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/4668
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Giorgio Arena <giorgio.arena@arm.com>
---
 src/core/CL/cl_kernels/scale.cl           | 51 +++++++++++++++++++------------
 src/core/CL/cl_kernels/scale_quantized.cl | 49 ++++++++++++++++++-----------
 src/core/CL/kernels/CLScaleKernel.cpp     | 22 ++++++-------
 3 files changed, 72 insertions(+), 50 deletions(-)

(limited to 'src/core')

diff --git a/src/core/CL/cl_kernels/scale.cl b/src/core/CL/cl_kernels/scale.cl
index a01ff89a4f..d4c27e6cf6 100644
--- a/src/core/CL/cl_kernels/scale.cl
+++ b/src/core/CL/cl_kernels/scale.cl
@@ -189,8 +189,8 @@ __kernel void scale_nearest_neighbour_nhwc(
     float new_x = get_global_id(1) * scale_x;
     float new_y = (get_global_id(2) % DEPTH_OUT) * scale_y;
 #elif SAMPLING_POLICY_CENTER
-    float new_x = (get_global_id(1) + 0.5f) * scale_x;
-    float new_y = ((get_global_id(2) % DEPTH_OUT) + 0.5f) * scale_y;
+    float       new_x = (get_global_id(1) + 0.5f) * scale_x;
+    float       new_y = ((get_global_id(2) % DEPTH_OUT) + 0.5f) * scale_y;
 #else /* SAMPLING_POLICY */
 #error("Unsupported sampling policy");
 #endif /* SAMPLING_POLICY */
@@ -209,6 +209,7 @@ __kernel void scale_nearest_neighbour_nhwc(
  * @note Sampling policy to be used is passed as -DSAMPLING_POLICY_(TYPE) e.g. -DSAMPLING_POLICY_TOP_LEFT
  * @note If border mode replicate is used, is should be passed as -DBORDER_MODE_REPLICATE
  * @note Output tensor's depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. -DDEPTH=16
+ * @note The value to be used at the edges of the images shoud be given as a preprocessor argument using -DCONSTANT_VALUE=value.
  *
  * @param[in]  in_ptr                            Pointer to the source image. Supported data types: U8/S16/F16/F32.
  * @param[in]  in_stride_x                       Stride of the source image in X dimension (in bytes)
@@ -230,6 +231,7 @@ __kernel void scale_nearest_neighbour_nhwc(
  * @param[in]  input_height                      Input image height
  * @param[in]  scale_x                           The scale factor along x dimension
  * @param[in]  scale_y                           The scale factor along y dimension
+ *
  */
 __kernel void scale_bilinear_nhwc(
     TENSOR4D_DECLARATION(in),
@@ -252,27 +254,38 @@ __kernel void scale_bilinear_nhwc(
 #error("Unsupported sampling policy");
 #endif /* SAMPLING_POLICY */
 
-    const float new_xf      = floor(new_x);
-    const float new_yf      = floor(new_y);
-    float       clamped_x   = clamp(new_xf, 0.0f, input_width - 1);
-    float       clamped_x1  = clamp(new_xf + 1, 0.0f, input_width - 1);
-    float       clamped_x_  = clamped_x;
-    float       clamped_x1_ = clamped_x1;
-    const float clamped_y   = clamp(new_yf, 0.0f, input_height - 1);
-    const float clamped_y1  = clamp(new_yf + 1, 0.0f, input_height - 1);
+    const float new_xf     = floor(new_x);
+    const float new_yf     = floor(new_y);
+    const float clamped_x  = clamp(new_xf, 0.0f, input_width - 1);
+    const float clamped_x1 = clamp(new_xf + 1, 0.0f, input_width - 1);
+    const float clamped_y  = clamp(new_yf, 0.0f, input_height - 1);
+    const float clamped_y1 = clamp(new_yf + 1, 0.0f, input_height - 1);
 
 #ifndef BORDER_MODE_REPLICATE
-    clamped_x1  = select(clamped_x1, 0.0f - BORDER_SIZE, new_yf + 1 < 0.f || new_yf + 1 > input_height - 1 || new_xf + 1 < 0.f || new_xf + 1 > input_width - 1);
-    clamped_x_  = select(clamped_x_, 0.0f - BORDER_SIZE, new_yf + 1 > input_height - 1 || new_xf < 0.f || new_xf > input_width - 1);
-    clamped_x   = select(clamped_x, 0.0f - BORDER_SIZE, new_yf < 0.f || new_yf > input_height - 1 || new_xf < 0.f || new_xf > input_width - 1);
-    clamped_x1_ = select(clamped_x1_, 0.0f - BORDER_SIZE, new_xf + 1 < 0.f || new_xf + 1 > input_width - 1 || new_yf < 0.f || new_yf > input_height - 1);
+    const bool  check_x = (0.f <= new_xf && new_xf < input_width);
+    const bool  check_x1 = (-1.f <= new_xf && new_xf < input_width - 1);
+    const bool  check_y = (0.f <= new_yf && new_yf < input_height);
+    const bool  check_y1 = (-1.f <= new_yf && new_yf < input_height - 1);
+    const float ins_0   = select((float)(CONSTANT_VALUE), (float)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y),
+                                                                                                          (get_global_id(2) / DEPTH_OUT)))),
+                                 check_x && check_y);
+    const float ins_1 = select((float)(CONSTANT_VALUE), (float)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y),
+                                                                                                        (get_global_id(2) / DEPTH_OUT)))),
+                                 check_x1 && check_y);
+    const float ins_2 = select((float)(CONSTANT_VALUE), (float)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y1),
+                                                                                                        (get_global_id(2) / DEPTH_OUT)))),
+                                 check_x && check_y1);
+    const float ins_3 = select((float)(CONSTANT_VALUE), (float)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y1),
+                                                                                                        (get_global_id(2) / DEPTH_OUT)))),
+                                 check_x1 && check_y1);
+    float4 ins = (float4)(ins_0, ins_1, ins_2, ins_3);
+#else  /* BORDER_MODE_REPLICATE */
+    float4 ins        = (float4)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y), (get_global_id(2) / DEPTH_OUT))),
+                                 *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y), (get_global_id(2) / DEPTH_OUT))),
+                                 *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y1), (get_global_id(2) / DEPTH_OUT))),
+                                 *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y1), (get_global_id(2) / DEPTH_OUT))));
 #endif /* BORDER_MODE_REPLICATE */
 
-    float4 ins = (float4)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y), (get_global_id(2) / DEPTH_OUT))),
-                          *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1_), convert_int(clamped_y), (get_global_id(2) / DEPTH_OUT))),
-                          *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x_), convert_int(clamped_y1), (get_global_id(2) / DEPTH_OUT))),
-                          *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y1), (get_global_id(2) / DEPTH_OUT))));
-
     const float a  = new_x - new_xf;
     const float b  = 1.f - a;
     const float a1 = new_y - new_yf;
diff --git a/src/core/CL/cl_kernels/scale_quantized.cl b/src/core/CL/cl_kernels/scale_quantized.cl
index 2aa7f185c6..010e4ed57a 100644
--- a/src/core/CL/cl_kernels/scale_quantized.cl
+++ b/src/core/CL/cl_kernels/scale_quantized.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -93,6 +93,7 @@ __kernel void scale_bilinear_quantized_nchw(
  * @note Offset value for QASYMM8 data type to used is passed as -DOFFSET=<VALUE> e.g. -DOFFSET=1
  * @note If border mode replicate is used, is should be passed as -DBORDER_MODE_REPLICATE
  * @note Output tensor's depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. -DDEPTH=16
+ * @note The value to be used at the edges of the images shoud be given as a preprocessor argument using -DCONSTANT_VALUE=value.
  *
  * @param[in]  in_ptr                            Pointer to the source image. Supported data types: QASYMM8.
  * @param[in]  in_stride_x                       Stride of the source image in X dimension (in bytes)
@@ -114,6 +115,7 @@ __kernel void scale_bilinear_quantized_nchw(
  * @param[in]  input_height                      Input image height
  * @param[in]  scale_x                           The scale factor along x dimension
  * @param[in]  scale_y                           The scale factor along y dimension
+ * @param[in]  constant_border_value             Constant border value to use
  */
 __kernel void scale_bilinear_quantized_nhwc(
     TENSOR4D_DECLARATION(in),
@@ -136,27 +138,38 @@ __kernel void scale_bilinear_quantized_nhwc(
 #error("Unsupported sampling policy");
 #endif /* SAMPLING_POLICY */
 
-    const float new_xf      = floor(new_x);
-    const float new_yf      = floor(new_y);
-    float       clamped_x   = clamp(new_xf, 0.0f, input_width - 1);
-    float       clamped_x1  = clamp(new_xf + 1, 0.0f, input_width - 1);
-    float       clamped_x_  = clamped_x;
-    float       clamped_x1_ = clamped_x1;
-    const float clamped_y   = clamp(new_yf, 0.0f, input_height - 1);
-    const float clamped_y1  = clamp(new_yf + 1, 0.0f, input_height - 1);
+    const float new_xf     = floor(new_x);
+    const float new_yf     = floor(new_y);
+    const float clamped_x  = clamp(new_xf, 0.0f, input_width - 1);
+    const float clamped_x1 = clamp(new_xf + 1, 0.0f, input_width - 1);
+    const float clamped_y  = clamp(new_yf, 0.0f, input_height - 1);
+    const float clamped_y1 = clamp(new_yf + 1, 0.0f, input_height - 1);
 
 #ifndef BORDER_MODE_REPLICATE
-    clamped_x1  = select(clamped_x1, 0.0f - BORDER_SIZE, new_yf + 1 < 0.f || new_yf + 1 > input_height - 1 || new_xf + 1 < 0.f || new_xf + 1 > input_width - 1);
-    clamped_x_  = select(clamped_x_, 0.0f - BORDER_SIZE, new_yf + 1 > input_height - 1 || new_xf < 0.f || new_xf > input_width - 1);
-    clamped_x   = select(clamped_x, 0.0f - BORDER_SIZE, new_yf < 0.f || new_yf > input_height - 1 || new_xf < 0.f || new_xf > input_width - 1);
-    clamped_x1_ = select(clamped_x1_, 0.0f - BORDER_SIZE, new_xf + 1 < 0.f || new_xf + 1 > input_width - 1 || new_yf < 0.f || new_yf > input_height - 1);
+    const bool  check_x = (0.f <= new_xf && new_xf < input_width);
+    const bool  check_x1 = (-1.f <= new_xf && new_xf < input_width - 1);
+    const bool  check_y = (0.f <= new_yf && new_yf < input_height);
+    const bool  check_y1 = (-1.f <= new_yf && new_yf < input_height - 1);
+    const int ins_0    = select((int)(CONSTANT_VALUE), (int)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y),
+                                                                                                     (get_global_id(2) / DEPTH_OUT)))),
+                                 check_x && check_y);
+    const int ins_1 = select((int)(CONSTANT_VALUE), (int)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y),
+                                                                                                  (get_global_id(2) / DEPTH_OUT)))),
+                                 check_x1 && check_y);
+    const int ins_2 = select((int)(CONSTANT_VALUE), (int)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y1),
+                                                                                                  (get_global_id(2) / DEPTH_OUT)))),
+                                 check_x && check_y1);
+    const int ins_3 = select((int)(CONSTANT_VALUE), (int)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y1),
+                                                                                                  (get_global_id(2) / DEPTH_OUT)))),
+                                 check_x1 && check_y1);
+    int4 ins = (int4)(ins_0, ins_1, ins_2, ins_3);
+#else  /* BORDER_MODE_REPLICATE */
+    int4 ins          = (int4)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y), (get_global_id(2) / DEPTH_OUT))),
+                               *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y), (get_global_id(2) / DEPTH_OUT))),
+                               *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y1), (get_global_id(2) / DEPTH_OUT))),
+                               *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y1), (get_global_id(2) / DEPTH_OUT))));
 #endif /* BORDER_MODE_REPLICATE */
 
-    int4 ins = (int4)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y), (get_global_id(2) / DEPTH_OUT))),
-                      *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1_), convert_int(clamped_y), (get_global_id(2) / DEPTH_OUT))),
-                      *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x_), convert_int(clamped_y1), (get_global_id(2) / DEPTH_OUT))),
-                      *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y1), (get_global_id(2) / DEPTH_OUT))));
-
     const float  a      = new_x - new_xf;
     const float  b      = 1.f - a;
     const float  a1     = new_y - new_yf;
diff --git a/src/core/CL/kernels/CLScaleKernel.cpp b/src/core/CL/kernels/CLScaleKernel.cpp
index 5a7d5830fd..f3d2fa12d5 100644
--- a/src/core/CL/kernels/CLScaleKernel.cpp
+++ b/src/core/CL/kernels/CLScaleKernel.cpp
@@ -120,15 +120,8 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
         break;
         case DataLayout::NHWC:
         {
-            num_elems_processed_per_iteration = 1;
             // Configure kernel window
-            win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
-            AccessWindowStatic input_access(input, -border.left, -border.top,
-                                            input->dimension(0) + border.right,
-                                            input->dimension(1) + border.bottom);
-            AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
-            window_changed = update_window_and_padding(win, input_access, output_access);
-            output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
+            win = calculate_max_window(*output, Steps());
         }
         break;
         default:
@@ -142,14 +135,13 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
 
 BorderSize CLScaleKernel::border_size() const
 {
-    return BorderSize(1);
+    return BorderSize(static_cast<size_t>(_data_layout == DataLayout::NCHW));
 }
 
 Status CLScaleKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ScaleKernelInfo &info)
 {
-    BorderSize border = BorderSize(1);
-
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, info));
+    BorderSize border = BorderSize(static_cast<size_t>(input->data_layout() == DataLayout::NCHW));
     ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), info, border).first);
 
     return Status{};
@@ -173,6 +165,7 @@ void CLScaleKernel::configure(const ICLTensor *input, ICLTensor *output, const S
 void CLScaleKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ScaleKernelInfo &info)
 {
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), info));
+    auto padding_info = get_padding_info({ input, output });
 
     _input                = input;
     _output               = output;
@@ -208,6 +201,7 @@ void CLScaleKernel::configure(const CLCompileContext &compile_context, const ICL
     // Create kernel
     CLBuildOptions build_opts;
     build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+    build_opts.add_option("-DCONSTANT_VALUE=" + string_from_pixel_value(info.constant_border_value, input->info()->data_type()));
     build_opts.add_option("-DBORDER_SIZE=" + support::cpp11::to_string(border.right));
     build_opts.add_option_if(info.border_mode == BorderMode::REPLICATE, "-DBORDER_MODE_REPLICATE");
     build_opts.add_option_if(is_nhwc, "-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2)));
@@ -219,7 +213,6 @@ void CLScaleKernel::configure(const CLCompileContext &compile_context, const ICL
         build_opts.add_option("-DSCALE=" + support::cpp11::to_string(qinfo.scale));
         build_opts.add_option("-DOFFSET=" + support::cpp11::to_string(qinfo.offset));
     }
-
     std::string interpolation_name = string_from_interpolation_policy(interpolation_policy_to_use);
     std::transform(interpolation_name.begin(), interpolation_name.end(), interpolation_name.begin(), ::tolower);
     std::string kernel_name = "scale_" + interpolation_name;
@@ -250,13 +243,16 @@ void CLScaleKernel::configure(const CLCompileContext &compile_context, const ICL
     _config_id += support::cpp11::to_string(output->info()->dimension(2));
     _config_id += "_";
     _config_id += support::cpp11::to_string(output->info()->dimension(3));
+    if(is_nhwc)
+    {
+        ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
+    }
 }
 
 void CLScaleKernel::run(const Window &window, cl::CommandQueue &queue)
 {
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
     switch(_data_layout)
     {
         case DataLayout::NCHW:
-- 
cgit v1.2.1