COMPMID-3539: Change indexing for nearest neighbor with aligned corners

For nearest neighbor interpolation policy with aligned corners all of NEON, CL and reference use round() rather than float to find the nearest integer. Change-Id: If0360da870e983303bf0424ca1100084084c1efc Signed-off-by: Sang-Hoon Park <sang-hoon.park@arm.com> Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/3495 Tested-by: Arm Jenkins <bsgcomp@arm.com> Reviewed-by: Michele Di Giorgio <michele.digiorgio@arm.com> Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
author: Sang-Hoon Park <sang-hoon.park@arm.com> 2020-07-02 10:49:39 +0100
committer: Georgios Pinitas <georgios.pinitas@arm.com> 2020-07-03 17:39:20 +0000
commit: 94d5051bc56a59d857f8a09560e2da5c0d7894b0 (patch)
tree: 577d9ce10f73608a71c033b495b54457e6c5b1ab
parent: 2aad21a900a21f467b3ec6b37420f892f0d80221 (diff)
download: ComputeLibrary-94d5051bc56a59d857f8a09560e2da5c0d7894b0.tar.gz
6 files changed, 46 insertions, 36 deletions
diff --git a/src/core/CL/cl_kernels/scale.cl b/src/core/CL/cl_kernels/scale.cl
index 499f9ea53f..971087021f 100644
--- a/src/core/CL/cl_kernels/scale.cl
+++ b/src/core/CL/cl_kernels/scale.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2020 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -100,10 +100,14 @@ __kernel void scale_nearest_neighbour_nchw(
     const float scale_x,
     const float scale_y)
 {
-    Image        in  = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in);
-    Image        out = CONVERT_TO_IMAGE_STRUCT(out);
-    const float2 r   = (float2)(scale_x, scale_y);
-    const float8 tc  = clamp_to_border_with_size(transform_nearest(get_current_coords(), r), input_width, input_height, BORDER_SIZE);
+    Image        in          = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in);
+    Image        out         = CONVERT_TO_IMAGE_STRUCT(out);
+    const float2 r           = (float2)(scale_x, scale_y);
+    float8       transformed = transform_nearest(get_current_coords(), r);
+#ifdef ALIGN_CORNERS
+    transformed = round(transformed);
+#endif // ALIGN_CORNERS
+    const float8 tc = clamp_to_border_with_size(transformed, input_width, input_height, BORDER_SIZE);
     vstore4(read_texels4(&in, convert_int8(tc)), 0, (__global DATA_TYPE *)out.ptr);
 }
 
@@ -182,14 +186,18 @@ __kernel void scale_nearest_neighbour_nhwc(
     Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(out, DEPTH_OUT);
 
 #ifdef SAMPLING_POLICY_TOP_LEFT
-    const float new_x = get_global_id(1) * scale_x;
-    const float new_y = (get_global_id(2) % DEPTH_OUT) * scale_y;
+    float new_x = get_global_id(1) * scale_x;
+    float new_y = (get_global_id(2) % DEPTH_OUT) * scale_y;
 #elif SAMPLING_POLICY_CENTER
-    const float new_x = (get_global_id(1) + 0.5f) * scale_x;
-    const float new_y = ((get_global_id(2) % DEPTH_OUT) + 0.5f) * scale_y;
+    float new_x = (get_global_id(1) + 0.5f) * scale_x;
+    float new_y = ((get_global_id(2) % DEPTH_OUT) + 0.5f) * scale_y;
 #else /* SAMPLING_POLICY */
 #error("Unsupported sampling policy");
 #endif /* SAMPLING_POLICY */
+#ifdef ALIGN_CORNERS
+    new_x = round(new_x);
+    new_y = round(new_y);
+#endif /* ALIGN_CORNERS */
     const float clamped_x = clamp(new_x, 0.0f, input_width - 1);
     const float clamped_y = clamp(new_y, 0.0f, input_height - 1);
 
diff --git a/src/core/CL/kernels/CLScaleKernel.cpp b/src/core/CL/kernels/CLScaleKernel.cpp
index f3acc3b31c..be8b6fc1c1 100644
--- a/src/core/CL/kernels/CLScaleKernel.cpp
+++ b/src/core/CL/kernels/CLScaleKernel.cpp
@@ -211,6 +211,7 @@ void CLScaleKernel::configure(const CLCompileContext &compile_context, const ICL
     build_opts.add_option_if(info.border_mode == BorderMode::REPLICATE, "-DBORDER_MODE_REPLICATE");
     build_opts.add_option_if(is_nhwc, "-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2)));
     build_opts.add_option_if_else(info.sampling_policy == SamplingPolicy::CENTER, "-DSAMPLING_POLICY_CENTER", "-DSAMPLING_POLICY_TOP_LEFT");
+    build_opts.add_option_if(_align_corners, "-DALIGN_CORNERS");
     if(call_quantized_kernel)
     {
         const UniformQuantizationInfo qinfo = input->info()->quantization_info().uniform();
diff --git a/src/core/NEON/kernels/NEScaleKernel.cpp b/src/core/NEON/kernels/NEScaleKernel.cpp
index 0f329a1c2c..38a0706c12 100644
--- a/src/core/NEON/kernels/NEScaleKernel.cpp
+++ b/src/core/NEON/kernels/NEScaleKernel.cpp
@@ -28,6 +28,7 @@
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/NEON/wrapper/wrapper.h"
 #include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/misc/Rounding.h"
 #include "arm_compute/core/utils/misc/Utility.h"
 
 #include "src/core/utils/ScaleUtils.h"
@@ -167,7 +168,7 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
 
 template <typename T>
 inline void scale_nearest_nhwc_core(const ITensor *input, const ITensor *offsets, ITensor *output,
-                                    float hr, Window window, const Window &win_in, size_t stride_w, size_t stride_h, size_t stride_c, float sampling_offset)
+                                    float hr, Window window, const Window &win_in, size_t stride_w, size_t stride_h, size_t stride_c, float sampling_offset, bool align_corners)
 {
     const int  window_step_x  = 16 / sizeof(T);
     const auto window_start_x = static_cast<int32_t>(window.x().start());
@@ -183,7 +184,7 @@ inline void scale_nearest_nhwc_core(const ITensor *input, const ITensor *offsets
     execute_window_loop(window, [&](const Coordinates & id)
     {
         const int32_t offset     = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z())));
-        const int     in_yi      = std::floor((id.z() + sampling_offset) * hr);
+        const auto    in_yi      = static_cast<int>(align_corners ? arm_compute::utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) : std::floor((id.z() + sampling_offset) * hr));
         const int     offset_row = in_yi * stride_h;
         int32_t       x          = window_start_x;
         for(; x < window_end_x - window_step_x; x += window_step_x)
@@ -460,8 +461,8 @@ void NEScaleKernel::scale_nearest_nchw(const Window &window)
                 const auto           offsets_ptr = reinterpret_cast<const int32_t *>(offsets.ptr());
                 const uint8_t *const in_ptr      = in.ptr();
 
-                const int in_yi         = std::floor((id.y() + _sampling_offset) * hr);
-                const int in_yi_clamped = std::min(static_cast<int>(_input->info()->dimension(1)), std::max(in_yi, -1));
+                const auto in_yi         = static_cast<int>(_align_corners ? arm_compute::utils::rounding::round_half_away_from_zero((id.y() + _sampling_offset) * hr) : std::floor((id.y() + _sampling_offset) * hr));
+                const int  in_yi_clamped = std::min(static_cast<int>(_input->info()->dimension(1)), std::max(in_yi, -1));
                 ARM_COMPUTE_ERROR_ON(in_yi_clamped < -1 || in_yi_clamped > static_cast<int>(_input->info()->dimension(1)));
                 const int offset_row = in_yi_clamped * input_stride;
 
@@ -497,8 +498,8 @@ void NEScaleKernel::scale_nearest_nchw(const Window &window)
                 const auto           offsets_ptr = reinterpret_cast<const int32_t *>(offsets.ptr());
                 const uint8_t *const in_ptr      = in.ptr();
 
-                const int in_yi         = std::floor((id.y() + _sampling_offset) * hr);
-                const int in_yi_clamped = std::min(static_cast<int>(_input->info()->dimension(1)), std::max(in_yi, -1));
+                const auto in_yi         = static_cast<int>(_align_corners ? arm_compute::utils::rounding::round_half_away_from_zero((id.y() + _sampling_offset) * hr) : std::floor((id.y() + _sampling_offset) * hr));
+                const int  in_yi_clamped = std::min(static_cast<int>(_input->info()->dimension(1)), std::max(in_yi, -1));
                 ARM_COMPUTE_ERROR_ON(in_yi_clamped < -1 || in_yi_clamped > static_cast<int>(_input->info()->dimension(1)));
                 const int offset_row = in_yi_clamped * input_stride;
 
@@ -537,9 +538,8 @@ void NEScaleKernel::scale_nearest_nchw(const Window &window)
             execute_window_loop(window, [&](const Coordinates & id)
             {
                 const auto offsets_ptr = reinterpret_cast<const int32_t *>(offsets.ptr());
-
-                const int in_yi      = std::floor((id.y() + _sampling_offset) * hr);
-                const int offset_row = in_yi * input_stride;
+                const auto in_yi       = static_cast<int>(_align_corners ? arm_compute::utils::rounding::round_half_away_from_zero((id.y() + _sampling_offset) * hr) : std::floor((id.y() + _sampling_offset) * hr));
+                const int  offset_row  = in_yi * input_stride;
 
                 tmp.val[0] = vsetq_lane_s16(*reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[0] + offset_row), tmp.val[0], 0);
                 tmp.val[0] = vsetq_lane_s16(*reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[2] + offset_row), tmp.val[0], 1);
@@ -578,9 +578,8 @@ void NEScaleKernel::scale_nearest_nchw(const Window &window)
             execute_window_loop(window, [&](const Coordinates & id)
             {
                 const auto offsets_ptr = reinterpret_cast<const int32_t *>(offsets.ptr());
-
-                const int in_yi      = std::floor((id.y() + _sampling_offset) * hr);
-                const int offset_row = in_yi * input_stride;
+                const auto in_yi       = static_cast<int>(_align_corners ? arm_compute::utils::rounding::round_half_away_from_zero((id.y() + _sampling_offset) * hr) : std::floor((id.y() + _sampling_offset) * hr));
+                const int  offset_row  = in_yi * input_stride;
 
                 tmp.val[0] = vsetq_lane_f16(*reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[0] + offset_row), tmp.val[0], 0);
                 tmp.val[0] = vsetq_lane_f16(*reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[2] + offset_row), tmp.val[0], 1);
@@ -621,9 +620,8 @@ void NEScaleKernel::scale_nearest_nchw(const Window &window)
             execute_window_loop(window, [&](const Coordinates & id)
             {
                 const auto offsets_ptr = reinterpret_cast<const int32_t *>(offsets.ptr());
-
-                const int in_yi      = std::floor((id.y() + _sampling_offset) * hr);
-                const int offset_row = in_yi * input_stride;
+                const auto in_yi       = static_cast<int>(_align_corners ? arm_compute::utils::rounding::round_half_away_from_zero((id.y() + _sampling_offset) * hr) : std::floor((id.y() + _sampling_offset) * hr));
+                const int  offset_row  = in_yi * input_stride;
 
                 tmp.val[0] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[0] + offset_row), tmp.val[0], 0);
                 tmp.val[0] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[4] + offset_row), tmp.val[0], 1);
@@ -1024,7 +1022,7 @@ void NEScaleKernel::scale_nhwc(const Window &window)
         {
             if(_policy == InterpolationPolicy::NEAREST_NEIGHBOR)
             {
-                scale_nearest_nhwc_core<int8_t>(_input, _offsets, _output, hr, window, win_in, input_stride_w, input_stride_h, input_stride_c, _sampling_offset);
+                scale_nearest_nhwc_core<int8_t>(_input, _offsets, _output, hr, window, win_in, input_stride_w, input_stride_h, input_stride_c, _sampling_offset, _align_corners);
             }
             else
             {
@@ -1038,7 +1036,7 @@ void NEScaleKernel::scale_nhwc(const Window &window)
         {
             if(_policy == InterpolationPolicy::NEAREST_NEIGHBOR)
             {
-                scale_nearest_nhwc_core<uint8_t>(_input, _offsets, _output, hr, window, win_in, input_stride_w, input_stride_h, input_stride_c, _sampling_offset);
+                scale_nearest_nhwc_core<uint8_t>(_input, _offsets, _output, hr, window, win_in, input_stride_w, input_stride_h, input_stride_c, _sampling_offset, _align_corners);
             }
             else
             {
@@ -1051,7 +1049,7 @@ void NEScaleKernel::scale_nhwc(const Window &window)
         {
             if(_policy == InterpolationPolicy::NEAREST_NEIGHBOR)
             {
-                scale_nearest_nhwc_core<int16_t>(_input, _offsets, _output, hr, window, win_in, input_stride_w, input_stride_h, input_stride_c, _sampling_offset);
+                scale_nearest_nhwc_core<int16_t>(_input, _offsets, _output, hr, window, win_in, input_stride_w, input_stride_h, input_stride_c, _sampling_offset, _align_corners);
             }
             else
             {
@@ -1066,7 +1064,7 @@ void NEScaleKernel::scale_nhwc(const Window &window)
             if(_policy == InterpolationPolicy::NEAREST_NEIGHBOR)
             {
                 scale_nearest_nhwc_core<float16_t>(_input, _offsets, _output, hr,
-                                                   window, win_in, input_stride_w, input_stride_h, input_stride_c, _sampling_offset);
+                                                   window, win_in, input_stride_w, input_stride_h, input_stride_c, _sampling_offset, _align_corners);
             }
             else
             {
@@ -1080,7 +1078,7 @@ void NEScaleKernel::scale_nhwc(const Window &window)
         {
             if(_policy == InterpolationPolicy::NEAREST_NEIGHBOR)
             {
-                scale_nearest_nhwc_core<float>(_input, _offsets, _output, hr, window, win_in, input_stride_w, input_stride_h, input_stride_c, _sampling_offset);
+                scale_nearest_nhwc_core<float>(_input, _offsets, _output, hr, window, win_in, input_stride_w, input_stride_h, input_stride_c, _sampling_offset, _align_corners);
             }
             else
             {
diff --git a/src/runtime/NEON/functions/NEScale.cpp b/src/runtime/NEON/functions/NEScale.cpp
index 28fbab4303..eefdfdbaa7 100644
--- a/src/runtime/NEON/functions/NEScale.cpp
+++ b/src/runtime/NEON/functions/NEScale.cpp
@@ -30,6 +30,7 @@
 #include "arm_compute/core/PixelValue.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/misc/Rounding.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
 #include "arm_compute/runtime/TensorAllocator.h"
 
@@ -43,7 +44,7 @@ namespace arm_compute
 {
 namespace
 {
-void precompute_dx_dy_offsets(ITensor *dx, ITensor *dy, ITensor *offsets, float wr, float hr, size_t input_element_size, SamplingPolicy sampling_policy)
+void precompute_dx_dy_offsets(ITensor *dx, ITensor *dy, ITensor *offsets, float wr, float hr, size_t input_element_size, SamplingPolicy sampling_policy, bool align_corners)
 {
     ARM_COMPUTE_ERROR_ON(nullptr == offsets);
     ARM_COMPUTE_UNUSED(sampling_policy);
@@ -84,7 +85,8 @@ void precompute_dx_dy_offsets(ITensor *dx, ITensor *dy, ITensor *offsets, float
 
         execute_window_loop(win, [&](const Coordinates & id)
         {
-            const size_t in_xi = std::floor((id.x() + sampling_offset) * wr);
+            const float float_in_xi = (id.x() + sampling_offset) * wr;
+            const auto  in_xi       = static_cast<size_t>(align_corners ? arm_compute::utils::rounding::round_half_away_from_zero(float_in_xi) : std::floor(float_in_xi));
 
             *reinterpret_cast<int32_t *>(offsets_it.ptr()) = in_xi * input_element_size;
         },
@@ -143,7 +145,7 @@ void NEScale::configure(ITensor *input, ITensor *output, const ScaleKernelInfo &
             _offsets.allocator()->allocate();
 
             // Pre-compute offsets for nearest interpolation
-            precompute_dx_dy_offsets(nullptr, nullptr, &_offsets, wr, hr, input_element_size, info.sampling_policy);
+            precompute_dx_dy_offsets(nullptr, nullptr, &_offsets, wr, hr, input_element_size, info.sampling_policy, is_align_corners_used);
             break;
         }
         case InterpolationPolicy::BILINEAR:
@@ -163,7 +165,7 @@ void NEScale::configure(ITensor *input, ITensor *output, const ScaleKernelInfo &
             _dy.allocator()->allocate();
 
             // Pre-compute dx, dy and offsets for bilinear interpolation
-            precompute_dx_dy_offsets(&_dx, &_dy, &_offsets, wr, hr, input_element_size, info.sampling_policy);
+            precompute_dx_dy_offsets(&_dx, &_dy, &_offsets, wr, hr, input_element_size, info.sampling_policy, is_align_corners_used);
             break;
         }
         case InterpolationPolicy::AREA:
diff --git a/tests/validation/GLES_COMPUTE/Scale.cpp b/tests/validation/GLES_COMPUTE/Scale.cpp
index 7a88159481..62b30da7a2 100644
--- a/tests/validation/GLES_COMPUTE/Scale.cpp
+++ b/tests/validation/GLES_COMPUTE/Scale.cpp
@@ -52,7 +52,7 @@ const auto ScaleDataTypes = framework::dataset::make("DataType",
     DataType::F16,
 });
 
-/** Align corners, this functionality is supported only by NEON */
+/** Aligned corners, this functionality is supported only by NEON and OpenCL backends */
 const auto AlignCorners = framework::dataset::make("AlignCorners",
 {
     false,
diff --git a/tests/validation/reference/Scale.cpp b/tests/validation/reference/Scale.cpp
index 44beabb2d4..72f96d446d 100644
--- a/tests/validation/reference/Scale.cpp
+++ b/tests/validation/reference/Scale.cpp
@@ -25,6 +25,7 @@
 #include "Scale.h"
 
 #include "Utils.h"
+#include "arm_compute/core/utils/misc/Rounding.h"
 #include "arm_compute/core/utils/misc/Utility.h"
 #include "src/core/utils/ScaleUtils.h"
 
@@ -81,8 +82,8 @@ SimpleTensor<T> scale_core(const SimpleTensor<T> &in, float scale_x, float scale
                 switch(sampling_policy)
                 {
                     case SamplingPolicy::TOP_LEFT:
-                        x_src = std::floor(idx * wr);
-                        y_src = std::floor(idy * hr);
+                        x_src = align_corners ? arm_compute::utils::rounding::round_half_away_from_zero(idx * wr) : std::floor(idx * wr);
+                        y_src = align_corners ? arm_compute::utils::rounding::round_half_away_from_zero(idy * hr) : std::floor(idy * hr);
                         break;
                     case SamplingPolicy::CENTER:
                         //Calculate the source coords without -0.5f is equivalent to round the x_scr/y_src coords
author	Sang-Hoon Park <sang-hoon.park@arm.com>	2020-07-02 10:49:39 +0100
committer	Georgios Pinitas <georgios.pinitas@arm.com>	2020-07-03 17:39:20 +0000
commit	94d5051bc56a59d857f8a09560e2da5c0d7894b0 (patch)
tree	577d9ce10f73608a71c033b495b54457e6c5b1ab
parent	2aad21a900a21f467b3ec6b37420f892f0d80221 (diff)
download	ComputeLibrary-94d5051bc56a59d857f8a09560e2da5c0d7894b0.tar.gz