From 83be745adba7a9928c03beda65a6a83f14846475 Mon Sep 17 00:00:00 2001
From: Isabella Gottardi <isabella.gottardi@arm.com>
Date: Tue, 29 Aug 2017 13:47:03 +0100
Subject: COMPMID-424 Implemented reference implementation and tests for
 WarpAffine

Change-Id: I4924ab1de17adc3b880a5cc22f2497abbc8e221b
Reviewed-on: http://mpd-gerrit.cambridge.arm.com/85820
Tested-by: Kaizen <jeremy.johnson+kaizengerrit@arm.com>
Reviewed-by: Steven Niu <steven.niu@arm.com>
---
 src/core/CL/cl_kernels/warp_affine.cl      |   4 +-
 src/core/CL/kernels/CLWarpAffineKernel.cpp |   7 +-
 src/core/NEON/kernels/NEWarpKernel.cpp     | 107 ++++++++++++++++++++++++-----
 3 files changed, 98 insertions(+), 20 deletions(-)

(limited to 'src')
diff --git a/src/core/CL/cl_kernels/warp_affine.cl b/src/core/CL/cl_kernels/warp_affine.cl
index 0a4748f452..f41821cdca 100644
--- a/src/core/CL/cl_kernels/warp_affine.cl
+++ b/src/core/CL/cl_kernels/warp_affine.cl
@@ -84,7 +84,7 @@ __kernel void warp_affine_nearest_neighbour(
 {
     Image in  = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in);
     Image out = CONVERT_TO_IMAGE_STRUCT(out);
-    vstore4(read_texels4(&in, convert_int8(clamp_to_border(apply_affine_transform(get_current_coords(), build_affine_mtx()), width, height))), 0, out.ptr);
+    vstore4(read_texels4(&in, convert_int8_rtn(clamp_to_border(apply_affine_transform(get_current_coords(), build_affine_mtx()), width, height))), 0, out.ptr);
 }
 
 /** Performs an affine transform on an image interpolating with the BILINEAR method. Input and output are single channel U8.
@@ -116,5 +116,5 @@ __kernel void warp_affine_bilinear(
 {
     Image in  = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in);
     Image out = CONVERT_TO_IMAGE_STRUCT(out);
-    vstore4(bilinear_interpolate(&in, clamp_to_border(apply_affine_transform(get_current_coords(), build_affine_mtx()), width, height), width, height), 0, out.ptr);
+    vstore4(bilinear_interpolate(&in, apply_affine_transform(get_current_coords(), build_affine_mtx()), width, height), 0, out.ptr);
 }
diff --git a/src/core/CL/kernels/CLWarpAffineKernel.cpp b/src/core/CL/kernels/CLWarpAffineKernel.cpp
index e549dbc258..be095f2c34 100644
--- a/src/core/CL/kernels/CLWarpAffineKernel.cpp
+++ b/src/core/CL/kernels/CLWarpAffineKernel.cpp
@@ -88,8 +88,11 @@ void CLWarpAffineKernel::configure(const ICLTensor *input, ICLTensor *output, co
 
     Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
 
-    AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
-    AccessWindowStatic     output_access(output->info(), 0, 0, output->info()->dimension(0), output->info()->dimension(1));
+    int       total_right  = ceil_to_multiple(input->info()->dimension(0), num_elems_processed_per_iteration);
+    const int access_right = total_right + (((total_right - input->info()->dimension(0)) == 0) ? border_size().right : 0);
+
+    AccessWindowStatic     input_access(input->info(), -border_size().left, -border_size().top, access_right, input->info()->dimension(1) + border_size().bottom);
+    AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
 
     update_window_and_padding(win, input_access, output_access);
 
diff --git a/src/core/NEON/kernels/NEWarpKernel.cpp b/src/core/NEON/kernels/NEWarpKernel.cpp
index 62f4e5d057..ab8ab14ae5 100644
--- a/src/core/NEON/kernels/NEWarpKernel.cpp
+++ b/src/core/NEON/kernels/NEWarpKernel.cpp
@@ -143,7 +143,11 @@ void NEWarpAffineKernel<interpolation>::warp_undefined(const Window &window)
     const float start_y0 = M10 * window.x().start();
 
     // Current row
-    int y_cur = window.y().start();
+    int y_cur  = window.y().start();
+    int z_cur  = window.z().start();
+    int d3_cur = window[3].start();
+    int d4_cur = window[4].start();
+    int d5_cur = window[5].start();
 
     // const_x0 and const_y0 are the constant parts of x0 and y0 during the row processing
     float const_x0 = M01 * y_cur + M02;
@@ -155,10 +159,14 @@ void NEWarpAffineKernel<interpolation>::warp_undefined(const Window &window)
 
     execute_window_loop(window, [&](const Coordinates & id)
     {
-        // Check if we are processing a new row. If so, update the current row (y_cur), x0 and y0
-        if(y_cur != id.y())
+        // Check if we are processing a new row. If so, update the current processed row (y_cur), x0, y0 and z0
+        if((y_cur != id.y()) || (z_cur != id.z()) || (d3_cur != id[3]) || (d4_cur != id[4]) || (d5_cur != id[5]))
         {
-            y_cur = id.y();
+            y_cur  = id.y();
+            z_cur  = id.z();
+            d3_cur = id[3];
+            d4_cur = id[4];
+            d5_cur = id[5];
 
             const_x0 = M01 * y_cur + M02;
             const_y0 = M11 * y_cur + M12;
@@ -222,7 +230,11 @@ void NEWarpAffineKernel<interpolation>::warp_constant(const Window &window)
     const float start_y0 = M10 * window.x().start();
 
     // Current row
-    int y_cur = window.y().start();
+    int y_cur  = window.y().start();
+    int z_cur  = window.z().start();
+    int d3_cur = window[3].start();
+    int d4_cur = window[4].start();
+    int d5_cur = window[5].start();
 
     // const_x0 and const_y0 are the constant parts of x0 and y0 during the row processing
     float const_x0 = M01 * y_cur + M02;
@@ -234,10 +246,14 @@ void NEWarpAffineKernel<interpolation>::warp_constant(const Window &window)
 
     execute_window_loop(window, [&](const Coordinates & id)
     {
-        // Check if we are processing a new row. If so, update the current row (y_cur), x0 and y0
-        if(y_cur != id.y())
+        // Check if we are processing a new row. If so, update the current processed row (y_cur), x0, y0 and z0
+        if((y_cur != id.y()) || (z_cur != id.z()) || (d3_cur != id[3]) || (d4_cur != id[4]) || (d5_cur != id[5]))
         {
-            y_cur = id.y();
+            y_cur  = id.y();
+            z_cur  = id.z();
+            d3_cur = id[3];
+            d4_cur = id[4];
+            d5_cur = id[5];
 
             const_x0 = M01 * y_cur + M02;
             const_y0 = M11 * y_cur + M12;
@@ -264,7 +280,34 @@ void NEWarpAffineKernel<interpolation>::warp_constant(const Window &window)
         }
         else
         {
-            *out.ptr() = _constant_border_value;
+            switch(interpolation)
+            {
+                case InterpolationPolicy::NEAREST_NEIGHBOR:
+                    *out.ptr() = _constant_border_value;
+                    break;
+                case InterpolationPolicy::BILINEAR:
+                {
+                    const auto xi   = clamp<int>(std::floor(x0), min_x - 1, max_x);
+                    const auto yi   = clamp<int>(std::floor(y0), min_y - 1, max_y);
+                    const auto xi_1 = clamp<int>(std::floor(x0 + 1), min_x - 1, max_x);
+                    const auto yi_1 = clamp<int>(std::floor(y0 + 1), min_y - 1, max_y);
+
+                    const float dx  = x0 - std::floor(x0);
+                    const float dy  = y0 - std::floor(y0);
+                    const float dx1 = 1.0f - dx;
+                    const float dy1 = 1.0f - dy;
+
+                    const float a00 = *(in.ptr() + xi + yi * stride);
+                    const float a01 = *(in.ptr() + xi_1 + yi * stride);
+                    const float a10 = *(in.ptr() + xi + yi_1 * stride);
+                    const float a11 = *(in.ptr() + xi_1 + yi_1 * stride);
+
+                    *out.ptr() = a00 * (dx1 * dy1) + a01 * (dx * dy1) + a10 * (dx1 * dy) + a11 * (dx * dy);
+                }
+                break;
+                default:
+                    ARM_COMPUTE_ERROR("Interpolation not supported");
+            }
         }
 
         x0 += M00;
@@ -292,7 +335,11 @@ void NEWarpAffineKernel<interpolation>::warp_replicate(const Window &window)
     const size_t stride = _input->info()->strides_in_bytes()[1];
 
     // Current row
-    int y_cur = window.y().start();
+    int y_cur  = window.y().start();
+    int z_cur  = window.z().start();
+    int d3_cur = window[3].start();
+    int d4_cur = window[4].start();
+    int d5_cur = window[5].start();
 
     const float M00 = _matrix[0];
     const float M10 = _matrix[1];
@@ -314,10 +361,14 @@ void NEWarpAffineKernel<interpolation>::warp_replicate(const Window &window)
 
     execute_window_loop(window, [&](const Coordinates & id)
     {
-        // Check if we are processing a new row. If so, update the current row (y_cur), x0 and y0
-        if(y_cur != id.y())
+        // Check if we are processing a new row. If so, update the current processed row (y_cur), x0, y0 and z0
+        if((y_cur != id.y()) || (z_cur != id.z()) || (d3_cur != id[3]) || (d4_cur != id[4]) || (d5_cur != id[5]))
         {
-            y_cur = id.y();
+            y_cur  = id.y();
+            z_cur  = id.z();
+            d3_cur = id[3];
+            d4_cur = id[4];
+            d5_cur = id[5];
 
             const_x0 = M01 * y_cur + M02;
             const_y0 = M11 * y_cur + M12;
@@ -345,10 +396,34 @@ void NEWarpAffineKernel<interpolation>::warp_replicate(const Window &window)
         else
         {
             // Clamp coordinates
-            const auto xi = clamp<int>(x0, min_x, max_x - 1);
-            const auto yi = clamp<int>(y0, min_y, max_y - 1);
+            const auto xi = clamp<int>(std::floor(x0), min_x, max_x - 1);
+            const auto yi = clamp<int>(std::floor(y0), min_y, max_y - 1);
+            switch(interpolation)
+            {
+                case InterpolationPolicy::NEAREST_NEIGHBOR:
+                    *out.ptr() = *(in.ptr() + xi + yi * stride);
+                    break;
+                case InterpolationPolicy::BILINEAR:
+                {
+                    const auto xi_1 = clamp<int>(std::floor(x0 + 1), min_x, max_x - 1);
+                    const auto yi_1 = clamp<int>(std::floor(y0 + 1), min_y, max_y - 1);
 
-            *out.ptr() = *(in.ptr() + xi + yi * stride);
+                    const float dx  = x0 - std::floor(x0);
+                    const float dy  = y0 - std::floor(y0);
+                    const float dx1 = 1.0f - dx;
+                    const float dy1 = 1.0f - dy;
+
+                    const float a00 = *(in.ptr() + xi + yi * stride);
+                    const float a01 = *(in.ptr() + xi_1 + yi * stride);
+                    const float a10 = *(in.ptr() + xi + yi_1 * stride);
+                    const float a11 = *(in.ptr() + xi_1 + yi_1 * stride);
+
+                    *out.ptr() = a00 * (dx1 * dy1) + a01 * (dx * dy1) + a10 * (dx1 * dy) + a11 * (dx * dy);
+                }
+                break;
+                default:
+                    ARM_COMPUTE_ERROR("Interpolation not supported");
+            }
         }
 
         x0 += M00;
-- 
cgit v1.2.1