COMPMID-417: Add support for floats in scale.

Change-Id: I7d714ba13861509080a89817f54e9d32da83e970 Reviewed-on: http://mpd-gerrit.cambridge.arm.com/86026 Reviewed-by: Pablo Tello <pablo.tello@arm.com> Tested-by: Kaizen <jeremy.johnson+kaizengerrit@arm.com>
author: Georgios Pinitas <georgios.pinitas@arm.com> 2017-08-31 18:12:42 +0100
committer: Anthony Barbier <anthony.barbier@arm.com> 2018-11-02 16:35:24 +0000
commit: 583137cc60580023abfd9d05abf933e7e117e29f (patch)
tree: b29ec55c11b65e2882e60c0cf8b592bf25e78b1b /src
parent: 3021edfb5e72ef4cd91dbc754ce6ac55388ebc4e (diff)
download: ComputeLibrary-583137cc60580023abfd9d05abf933e7e117e29f.tar.gz
8 files changed, 214 insertions, 75 deletions
diff --git a/src/core/CL/kernels/CLFillBorderKernel.cpp b/src/core/CL/kernels/CLFillBorderKernel.cpp
index d2610539d1..2e066c7753 100644
--- a/src/core/CL/kernels/CLFillBorderKernel.cpp
+++ b/src/core/CL/kernels/CLFillBorderKernel.cpp
@@ -29,6 +29,7 @@
 #include "arm_compute/core/CL/OpenCL.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
diff --git a/src/core/CL/kernels/CLScaleKernel.cpp b/src/core/CL/kernels/CLScaleKernel.cpp
index 23ce89aba2..66afc3db60 100644
--- a/src/core/CL/kernels/CLScaleKernel.cpp
+++ b/src/core/CL/kernels/CLScaleKernel.cpp
@@ -46,9 +46,10 @@ BorderSize CLScaleKernel::border_size() const
 
 void CLScaleKernel::configure(const ICLTensor *input, ICLTensor *output, InterpolationPolicy policy, bool border_undefined)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(output);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    ARM_COMPUTE_ERROR_ON(output == input);
 
     _input  = input;
     _output = output;
diff --git a/src/core/NEON/kernels/NEFillBorderKernel.cpp b/src/core/NEON/kernels/NEFillBorderKernel.cpp
index 3f1f678a7e..9505a2520f 100644
--- a/src/core/NEON/kernels/NEFillBorderKernel.cpp
+++ b/src/core/NEON/kernels/NEFillBorderKernel.cpp
@@ -27,16 +27,13 @@
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
 
 #include <algorithm>
 #include <cstdint>
 
-#if ARM_COMPUTE_ENABLE_FP16
-#include <arm_fp16.h> // needed for float16_t
-#endif                /* ARM_COMPUTE_ENABLE_FP16 */
-
 using namespace arm_compute;
 
 namespace
@@ -163,18 +160,20 @@ void NEFillBorderKernel::run(const Window &window, const ThreadInfo &info)
                 case DataType::S32:
                     fill_constant_value_single_channel<int32_t>(window);
                     break;
-#ifdef ARM_COMPUTE_ENABLE_FP16
                 case DataType::F16:
-                    static_assert(sizeof(float16_t) == 2, "Float16_t must be 16 bit");
-                    fill_constant_value_single_channel<float16_t>(window);
+                    static_assert(sizeof(half) == 2, "Float16_t must be 16 bit");
+                    fill_constant_value_single_channel<half>(window);
                     break;
-#endif /* ARM_COMPUTE_ENABLE_FP16 */
                 case DataType::F32:
                     static_assert(sizeof(float) == 4, "Float must be 32 bit");
                     if(_border_size.left == 1 && _border_size.top == 1)
+                    {
                         fill_constant_value_single_channel_special<float, 1u, 1u>(_tensor, window, _border_size.right, _border_size.bottom, _constant_border_value);
+                    }
                     else
+                    {
                         fill_constant_value_single_channel<float>(window);
+                    }
                     break;
                 default:
                     ARM_COMPUTE_ERROR("Not handled");
@@ -205,12 +204,10 @@ void NEFillBorderKernel::run(const Window &window, const ThreadInfo &info)
                 case DataType::S32:
                     fill_replicate_single_channel<int32_t>(window);
                     break;
-#ifdef ARM_COMPUTE_ENABLE_FP16
                 case DataType::F16:
-                    static_assert(sizeof(float16_t) == 2, "Float16_t must be 16 bit");
-                    fill_replicate_single_channel<float16_t>(window);
+                    static_assert(sizeof(half) == 2, "Float16_t must be 16 bit");
+                    fill_replicate_single_channel<half>(window);
                     break;
-#endif /* ARM_COMPUTE_ENABLE_FP16 */
                 case DataType::F32:
                     static_assert(sizeof(float) == 4, "Float must be 32 bit");
                     fill_replicate_single_channel<float>(window);
diff --git a/src/core/NEON/kernels/NERemapKernel.cpp b/src/core/NEON/kernels/NERemapKernel.cpp
index 2dbabca2e3..83004aedc1 100644
--- a/src/core/NEON/kernels/NERemapKernel.cpp
+++ b/src/core/NEON/kernels/NERemapKernel.cpp
@@ -192,24 +192,24 @@ void NERemapKernel::remap_bilinear(const Window &window)
         const uint8_t *in_ptr   = in.ptr();
 
         uint8x8_t tmp0 = vdup_n_u8(0);
-        tmp0           = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[0], mapy_ptr[0]), tmp0, 0);
-        tmp0           = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[1], mapy_ptr[1]), tmp0, 1);
-        tmp0           = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[2], mapy_ptr[2]), tmp0, 2);
-        tmp0           = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[3], mapy_ptr[3]), tmp0, 3);
-        tmp0           = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[4], mapy_ptr[4]), tmp0, 4);
-        tmp0           = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[5], mapy_ptr[5]), tmp0, 5);
-        tmp0           = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[6], mapy_ptr[6]), tmp0, 6);
-        tmp0           = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[7], mapy_ptr[7]), tmp0, 7);
+        tmp0           = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[0], mapy_ptr[0]), tmp0, 0);
+        tmp0           = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[1], mapy_ptr[1]), tmp0, 1);
+        tmp0           = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[2], mapy_ptr[2]), tmp0, 2);
+        tmp0           = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[3], mapy_ptr[3]), tmp0, 3);
+        tmp0           = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[4], mapy_ptr[4]), tmp0, 4);
+        tmp0           = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[5], mapy_ptr[5]), tmp0, 5);
+        tmp0           = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[6], mapy_ptr[6]), tmp0, 6);
+        tmp0           = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[7], mapy_ptr[7]), tmp0, 7);
 
         uint8x8_t tmp1 = vdup_n_u8(0);
-        tmp1           = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[8], mapy_ptr[8]), tmp1, 0);
-        tmp1           = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[9], mapy_ptr[9]), tmp1, 1);
-        tmp1           = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[10], mapy_ptr[10]), tmp1, 2);
-        tmp1           = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[11], mapy_ptr[11]), tmp1, 3);
-        tmp1           = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[12], mapy_ptr[12]), tmp1, 4);
-        tmp1           = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[13], mapy_ptr[13]), tmp1, 5);
-        tmp1           = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[14], mapy_ptr[14]), tmp1, 6);
-        tmp1           = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[15], mapy_ptr[15]), tmp1, 7);
+        tmp1           = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[8], mapy_ptr[8]), tmp1, 0);
+        tmp1           = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[9], mapy_ptr[9]), tmp1, 1);
+        tmp1           = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[10], mapy_ptr[10]), tmp1, 2);
+        tmp1           = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[11], mapy_ptr[11]), tmp1, 3);
+        tmp1           = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[12], mapy_ptr[12]), tmp1, 4);
+        tmp1           = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[13], mapy_ptr[13]), tmp1, 5);
+        tmp1           = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[14], mapy_ptr[14]), tmp1, 6);
+        tmp1           = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[15], mapy_ptr[15]), tmp1, 7);
 
         vst1q_u8(out.ptr(), vcombine_u8(tmp0, tmp1));
     },
diff --git a/src/core/NEON/kernels/NEScaleKernel.cpp b/src/core/NEON/kernels/NEScaleKernel.cpp
index 7ec4212227..6634d4b13c 100644
--- a/src/core/NEON/kernels/NEScaleKernel.cpp
+++ b/src/core/NEON/kernels/NEScaleKernel.cpp
@@ -50,8 +50,10 @@ BorderSize NEScaleKernel::border_size() const
 
 void NEScaleKernel::configure(const ITensor *input, const ITensor *dx, const ITensor *dy, const ITensor *offsets, ITensor *output, InterpolationPolicy policy, bool border_undefined)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(output);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    ARM_COMPUTE_ERROR_ON(output == input);
 
     if(policy == InterpolationPolicy::NEAREST_NEIGHBOR)
     {
@@ -243,6 +245,50 @@ void NEScaleKernel::scale_nearest(const Window &window)
             in, offsets, out);
             break;
         }
+        case DataType::F32:
+        {
+            float32x4x4_t tmp =
+            {
+                {
+                    vdupq_n_f32(0),
+                    vdupq_n_f32(0),
+                    vdupq_n_f32(0),
+                    vdupq_n_f32(0)
+                }
+            };
+
+            execute_window_loop(window, [&](const Coordinates & id)
+            {
+                const auto offsets_ptr = reinterpret_cast<const int32_t *>(offsets.ptr());
+
+                const int in_yi      = (id.y() + 0.5f) * hr;
+                const int offset_row = in_yi * input_stride;
+
+                tmp.val[0] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[0] + offset_row), tmp.val[0], 0);
+                tmp.val[0] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[4] + offset_row), tmp.val[0], 1);
+                tmp.val[0] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[8] + offset_row), tmp.val[0], 2);
+                tmp.val[0] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[12] + offset_row), tmp.val[0], 3);
+
+                tmp.val[1] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[1] + offset_row), tmp.val[1], 0);
+                tmp.val[1] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[5] + offset_row), tmp.val[1], 1);
+                tmp.val[1] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[9] + offset_row), tmp.val[1], 2);
+                tmp.val[1] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[13] + offset_row), tmp.val[1], 3);
+
+                tmp.val[2] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[2] + offset_row), tmp.val[2], 0);
+                tmp.val[2] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[6] + offset_row), tmp.val[2], 1);
+                tmp.val[2] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[10] + offset_row), tmp.val[2], 2);
+                tmp.val[2] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[14] + offset_row), tmp.val[2], 3);
+
+                tmp.val[3] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[3] + offset_row), tmp.val[3], 0);
+                tmp.val[3] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[7] + offset_row), tmp.val[3], 1);
+                tmp.val[3] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[11] + offset_row), tmp.val[3], 2);
+                tmp.val[3] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[15] + offset_row), tmp.val[3], 3);
+
+                vst4q_f32(reinterpret_cast<float *>(out.ptr()), tmp);
+            },
+            in, offsets, out);
+            break;
+        }
         default:
             ARM_COMPUTE_ERROR("Not supported");
             break;
@@ -251,7 +297,7 @@ void NEScaleKernel::scale_nearest(const Window &window)
 
 void NEScaleKernel::scale_bilinear(const Window &window)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(_input, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(_input, 1, DataType::U8, DataType::S16, DataType::F32);
 
     // Compute the ratio between source height and destination height
     const auto hr = static_cast<float>(_input->info()->dimension(1)) / static_cast<float>(_output->info()->dimension(1));
@@ -278,41 +324,140 @@ void NEScaleKernel::scale_bilinear(const Window &window)
     Iterator dy(_dy, win_off);
 
     /* Input image stride */
-    const size_t in_stride = _input->info()->strides_in_bytes()[1];
+    const size_t in_stide_in_bytes = _input->info()->strides_in_bytes()[1];
+    const size_t in_stride         = in_stide_in_bytes / _input->info()->element_size();
 
-    execute_window_loop(window, [&](const Coordinates & id)
+    switch(_input->info()->data_type())
     {
-        const auto offsets_ptr = reinterpret_cast<const int32_t *>(offsets.ptr());
-        const auto dx_ptr      = reinterpret_cast<const float *>(dx.ptr());
-        const auto dy_ptr      = reinterpret_cast<const float *>(dy.ptr());
-        const auto in_ptr      = reinterpret_cast<const uint8_t *>(in.ptr());
+        case DataType::U8:
+        {
+            execute_window_loop(window, [&](const Coordinates & id)
+            {
+                const auto offsets_ptr = reinterpret_cast<const int32_t *>(offsets.ptr());
+                const auto dx_ptr      = reinterpret_cast<const float *>(dx.ptr());
+                const auto dy_ptr      = reinterpret_cast<const float *>(dy.ptr());
+                const auto in_ptr      = reinterpret_cast<const uint8_t *>(in.ptr());
+
+                const int in_yi      = std::floor((id.y() + 0.5f) * hr - 0.5f);
+                const int offset_row = in_yi * in_stide_in_bytes;
+
+                uint8x8_t tmp0 = vdup_n_u8(0);
+                tmp0           = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[0] + offset_row], in_stride, dx_ptr[0], dy_ptr[0]), tmp0, 0);
+                tmp0           = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[1] + offset_row], in_stride, dx_ptr[1], dy_ptr[1]), tmp0, 1);
+                tmp0           = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[2] + offset_row], in_stride, dx_ptr[2], dy_ptr[2]), tmp0, 2);
+                tmp0           = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[3] + offset_row], in_stride, dx_ptr[3], dy_ptr[3]), tmp0, 3);
+                tmp0           = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[4] + offset_row], in_stride, dx_ptr[4], dy_ptr[4]), tmp0, 4);
+                tmp0           = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[5] + offset_row], in_stride, dx_ptr[5], dy_ptr[5]), tmp0, 5);
+                tmp0           = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[6] + offset_row], in_stride, dx_ptr[6], dy_ptr[6]), tmp0, 6);
+                tmp0           = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[7] + offset_row], in_stride, dx_ptr[7], dy_ptr[7]), tmp0, 7);
+
+                uint8x8_t tmp1 = vdup_n_u8(0);
+                tmp1           = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[8] + offset_row], in_stride, dx_ptr[8], dy_ptr[8]), tmp1, 0);
+                tmp1           = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[9] + offset_row], in_stride, dx_ptr[9], dy_ptr[9]), tmp1, 1);
+                tmp1           = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[10] + offset_row], in_stride, dx_ptr[10], dy_ptr[10]), tmp1, 2);
+                tmp1           = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[11] + offset_row], in_stride, dx_ptr[11], dy_ptr[11]), tmp1, 3);
+                tmp1           = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[12] + offset_row], in_stride, dx_ptr[12], dy_ptr[12]), tmp1, 4);
+                tmp1           = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[13] + offset_row], in_stride, dx_ptr[13], dy_ptr[13]), tmp1, 5);
+                tmp1           = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[14] + offset_row], in_stride, dx_ptr[14], dy_ptr[14]), tmp1, 6);
+                tmp1           = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[15] + offset_row], in_stride, dx_ptr[15], dy_ptr[15]), tmp1, 7);
+
+                vst1q_u8(out.ptr(), vcombine_u8(tmp0, tmp1));
+            },
+            in, offsets, dx, dy, out);
+            break;
+        }
+        case DataType::S16:
+        {
+            execute_window_loop(window, [&](const Coordinates & id)
+            {
+                const auto offsets_ptr = reinterpret_cast<const int32_t *>(offsets.ptr());
+                const auto dx_ptr      = reinterpret_cast<const float *>(dx.ptr());
+                const auto dy_ptr      = reinterpret_cast<const float *>(dy.ptr());
 
-        const int in_yi      = std::floor((id.y() + 0.5f) * hr - 0.5f);
-        const int offset_row = in_yi * in_stride;
+                const int in_yi      = std::floor((id.y() + 0.5f) * hr - 0.5f);
+                const int offset_row = in_yi * in_stide_in_bytes;
 
-        uint8x8_t tmp0 = vdup_n_u8(0);
-        tmp0           = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[0] + offset_row], in_stride, dx_ptr[0], dy_ptr[0]), tmp0, 0);
-        tmp0           = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[1] + offset_row], in_stride, dx_ptr[1], dy_ptr[1]), tmp0, 1);
-        tmp0           = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[2] + offset_row], in_stride, dx_ptr[2], dy_ptr[2]), tmp0, 2);
-        tmp0           = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[3] + offset_row], in_stride, dx_ptr[3], dy_ptr[3]), tmp0, 3);
-        tmp0           = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[4] + offset_row], in_stride, dx_ptr[4], dy_ptr[4]), tmp0, 4);
-        tmp0           = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[5] + offset_row], in_stride, dx_ptr[5], dy_ptr[5]), tmp0, 5);
-        tmp0           = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[6] + offset_row], in_stride, dx_ptr[6], dy_ptr[6]), tmp0, 6);
-        tmp0           = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[7] + offset_row], in_stride, dx_ptr[7], dy_ptr[7]), tmp0, 7);
+                int16x8x2_t tmp =
+                {
+                    {
+                        vdupq_n_s16(0),
+                        vdupq_n_s16(0)
+                    }
+                };
+
+                tmp.val[0] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[0] + offset_row), in_stride, dx_ptr[0], dy_ptr[0]), tmp.val[0], 0);
+                tmp.val[0] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[2] + offset_row), in_stride, dx_ptr[2], dy_ptr[2]), tmp.val[0], 1);
+                tmp.val[0] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[4] + offset_row), in_stride, dx_ptr[4], dy_ptr[4]), tmp.val[0], 2);
+                tmp.val[0] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[6] + offset_row), in_stride, dx_ptr[6], dy_ptr[6]), tmp.val[0], 3);
+                tmp.val[0] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[8] + offset_row), in_stride, dx_ptr[8], dy_ptr[8]), tmp.val[0], 4);
+                tmp.val[0] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[10] + offset_row), in_stride, dx_ptr[10], dy_ptr[10]), tmp.val[0], 5);
+                tmp.val[0] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[12] + offset_row), in_stride, dx_ptr[12], dy_ptr[12]), tmp.val[0], 6);
+                tmp.val[0] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[14] + offset_row), in_stride, dx_ptr[14], dy_ptr[14]), tmp.val[0], 7);
+
+                tmp.val[1] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[1] + offset_row), in_stride, dx_ptr[1], dy_ptr[1]), tmp.val[1], 0);
+                tmp.val[1] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[3] + offset_row), in_stride, dx_ptr[3], dy_ptr[3]), tmp.val[1], 1);
+                tmp.val[1] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[5] + offset_row), in_stride, dx_ptr[5], dy_ptr[5]), tmp.val[1], 2);
+                tmp.val[1] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[7] + offset_row), in_stride, dx_ptr[7], dy_ptr[7]), tmp.val[1], 3);
+                tmp.val[1] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[9] + offset_row), in_stride, dx_ptr[9], dy_ptr[9]), tmp.val[1], 4);
+                tmp.val[1] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[11] + offset_row), in_stride, dx_ptr[11], dy_ptr[11]), tmp.val[1], 5);
+                tmp.val[1] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[13] + offset_row), in_stride, dx_ptr[13], dy_ptr[13]), tmp.val[1], 6);
+                tmp.val[1] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[15] + offset_row), in_stride, dx_ptr[15], dy_ptr[15]), tmp.val[1], 7);
 
-        uint8x8_t tmp1 = vdup_n_u8(0);
-        tmp1           = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[8] + offset_row], in_stride, dx_ptr[8], dy_ptr[8]), tmp1, 0);
-        tmp1           = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[9] + offset_row], in_stride, dx_ptr[9], dy_ptr[9]), tmp1, 1);
-        tmp1           = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[10] + offset_row], in_stride, dx_ptr[10], dy_ptr[10]), tmp1, 2);
-        tmp1           = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[11] + offset_row], in_stride, dx_ptr[11], dy_ptr[11]), tmp1, 3);
-        tmp1           = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[12] + offset_row], in_stride, dx_ptr[12], dy_ptr[12]), tmp1, 4);
-        tmp1           = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[13] + offset_row], in_stride, dx_ptr[13], dy_ptr[13]), tmp1, 5);
-        tmp1           = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[14] + offset_row], in_stride, dx_ptr[14], dy_ptr[14]), tmp1, 6);
-        tmp1           = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[15] + offset_row], in_stride, dx_ptr[15], dy_ptr[15]), tmp1, 7);
+                vst2q_s16(reinterpret_cast<int16_t *>(out.ptr()), tmp);
+            },
+            in, offsets, dx, dy, out);
+            break;
+        }
+        case DataType::F32:
+        {
+            execute_window_loop(window, [&](const Coordinates & id)
+            {
+                const auto offsets_ptr = reinterpret_cast<const int32_t *>(offsets.ptr());
+                const auto dx_ptr      = reinterpret_cast<const float *>(dx.ptr());
+                const auto dy_ptr      = reinterpret_cast<const float *>(dy.ptr());
 
-        vst1q_u8(out.ptr(), vcombine_u8(tmp0, tmp1));
-    },
-    in, offsets, dx, dy, out);
+                const int in_yi      = std::floor((id.y() + 0.5f) * hr - 0.5f);
+                const int offset_row = in_yi * in_stide_in_bytes;
+
+                float32x4x4_t tmp =
+                {
+                    {
+                        vdupq_n_f32(0),
+                        vdupq_n_f32(0),
+                        vdupq_n_f32(0),
+                        vdupq_n_f32(0)
+                    }
+                };
+
+                tmp.val[0] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast<const float *>(in.ptr() + offsets_ptr[0] + offset_row), in_stride, dx_ptr[0], dy_ptr[0]), tmp.val[0], 0);
+                tmp.val[0] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast<const float *>(in.ptr() + offsets_ptr[4] + offset_row), in_stride, dx_ptr[4], dy_ptr[4]), tmp.val[0], 1);
+                tmp.val[0] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast<const float *>(in.ptr() + offsets_ptr[8] + offset_row), in_stride, dx_ptr[8], dy_ptr[8]), tmp.val[0], 2);
+                tmp.val[0] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast<const float *>(in.ptr() + offsets_ptr[12] + offset_row), in_stride, dx_ptr[12], dy_ptr[12]), tmp.val[0], 3);
+
+                tmp.val[1] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast<const float *>(in.ptr() + offsets_ptr[1] + offset_row), in_stride, dx_ptr[1], dy_ptr[1]), tmp.val[1], 0);
+                tmp.val[1] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast<const float *>(in.ptr() + offsets_ptr[5] + offset_row), in_stride, dx_ptr[5], dy_ptr[5]), tmp.val[1], 1);
+                tmp.val[1] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast<const float *>(in.ptr() + offsets_ptr[9] + offset_row), in_stride, dx_ptr[9], dy_ptr[9]), tmp.val[1], 2);
+                tmp.val[1] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast<const float *>(in.ptr() + offsets_ptr[13] + offset_row), in_stride, dx_ptr[13], dy_ptr[13]), tmp.val[1], 3);
+
+                tmp.val[2] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast<const float *>(in.ptr() + offsets_ptr[2] + offset_row), in_stride, dx_ptr[2], dy_ptr[2]), tmp.val[2], 0);
+                tmp.val[2] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast<const float *>(in.ptr() + offsets_ptr[6] + offset_row), in_stride, dx_ptr[6], dy_ptr[6]), tmp.val[2], 1);
+                tmp.val[2] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast<const float *>(in.ptr() + offsets_ptr[10] + offset_row), in_stride, dx_ptr[10], dy_ptr[10]), tmp.val[2], 2);
+                tmp.val[2] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast<const float *>(in.ptr() + offsets_ptr[14] + offset_row), in_stride, dx_ptr[14], dy_ptr[14]), tmp.val[2], 3);
+
+                tmp.val[3] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast<const float *>(in.ptr() + offsets_ptr[3] + offset_row), in_stride, dx_ptr[3], dy_ptr[3]), tmp.val[3], 0);
+                tmp.val[3] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast<const float *>(in.ptr() + offsets_ptr[7] + offset_row), in_stride, dx_ptr[7], dy_ptr[7]), tmp.val[3], 1);
+                tmp.val[3] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast<const float *>(in.ptr() + offsets_ptr[11] + offset_row), in_stride, dx_ptr[11], dy_ptr[11]), tmp.val[3], 2);
+                tmp.val[3] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast<const float *>(in.ptr() + offsets_ptr[15] + offset_row), in_stride, dx_ptr[15], dy_ptr[15]), tmp.val[3], 3);
+
+                vst4q_f32(reinterpret_cast<float *>(out.ptr()), tmp);
+            },
+            in, offsets, dx, dy, out);
+            break;
+        }
+        default:
+            ARM_COMPUTE_ERROR("Not supported");
+            break;
+    }
 }
 
 void NEScaleKernel::scale_area(const Window &window)
diff --git a/src/core/NEON/kernels/NEWarpKernel.cpp b/src/core/NEON/kernels/NEWarpKernel.cpp
index b13e99e800..62f4e5d057 100644
--- a/src/core/NEON/kernels/NEWarpKernel.cpp
+++ b/src/core/NEON/kernels/NEWarpKernel.cpp
@@ -177,7 +177,7 @@ void NEWarpAffineKernel<interpolation>::warp_undefined(const Window &window)
                     *out.ptr() = nearest_interpolation(in.ptr(), x0, y0, stride);
                     break;
                 case InterpolationPolicy::BILINEAR:
-                    *out.ptr() = pixel_bilinear_c1u8(in.ptr(), stride, x0, y0);
+                    *out.ptr() = pixel_bilinear_c1(in.ptr(), stride, x0, y0);
                     break;
                 default:
                     ARM_COMPUTE_ERROR("Interpolation not supported");
@@ -256,7 +256,7 @@ void NEWarpAffineKernel<interpolation>::warp_constant(const Window &window)
                     *out.ptr() = nearest_interpolation(in.ptr(), x0, y0, stride);
                     break;
                 case InterpolationPolicy::BILINEAR:
-                    *out.ptr() = pixel_bilinear_c1u8(in.ptr(), stride, x0, y0);
+                    *out.ptr() = pixel_bilinear_c1(in.ptr(), stride, x0, y0);
                     break;
                 default:
                     ARM_COMPUTE_ERROR("Interpolation not supported");
@@ -336,7 +336,7 @@ void NEWarpAffineKernel<interpolation>::warp_replicate(const Window &window)
                     *out.ptr() = nearest_interpolation(in.ptr(), x0, y0, stride);
                     break;
                 case InterpolationPolicy::BILINEAR:
-                    *out.ptr() = pixel_bilinear_c1u8(in.ptr(), stride, x0, y0);
+                    *out.ptr() = pixel_bilinear_c1(in.ptr(), stride, x0, y0);
                     break;
                 default:
                     ARM_COMPUTE_ERROR("Interpolation not supported");
@@ -445,7 +445,7 @@ void NEWarpPerspectiveKernel<interpolation>::warp_undefined(const Window &window
                     *out.ptr() = nearest_interpolation(in.ptr(), xn, yn, stride);
                     break;
                 case InterpolationPolicy::BILINEAR:
-                    *out.ptr() = pixel_bilinear_c1u8(in.ptr(), stride, xn, yn);
+                    *out.ptr() = pixel_bilinear_c1(in.ptr(), stride, xn, yn);
                     break;
                 default:
                     ARM_COMPUTE_ERROR("Interpolation not supported");
@@ -546,7 +546,7 @@ void NEWarpPerspectiveKernel<interpolation>::warp_constant(const Window &window)
                     *out.ptr() = nearest_interpolation(in.ptr(), xn, yn, stride);
                     break;
                 case InterpolationPolicy::BILINEAR:
-                    *out.ptr() = pixel_bilinear_c1u8(in.ptr(), stride, xn, yn);
+                    *out.ptr() = pixel_bilinear_c1(in.ptr(), stride, xn, yn);
                     break;
                 default:
                     ARM_COMPUTE_ERROR("Interpolation not supported");
@@ -678,7 +678,7 @@ void NEWarpPerspectiveKernel<interpolation>::warp_replicate(const Window &window
                     *out.ptr() = nearest_interpolation(in.ptr(), xn, yn, stride);
                     break;
                 case InterpolationPolicy::BILINEAR:
-                    *out.ptr() = pixel_bilinear_c1u8(in.ptr(), stride, xn, yn);
+                    *out.ptr() = pixel_bilinear_c1(in.ptr(), stride, xn, yn);
                     break;
                 default:
                     ARM_COMPUTE_ERROR("Interpolation not supported");
diff --git a/src/runtime/CL/functions/CLScale.cpp b/src/runtime/CL/functions/CLScale.cpp
index 05522b4af5..49b0275019 100644
--- a/src/runtime/CL/functions/CLScale.cpp
+++ b/src/runtime/CL/functions/CLScale.cpp
@@ -31,13 +31,8 @@
 
 using namespace arm_compute;
 
-void CLScale::configure(ICLTensor *input, ICLTensor *output, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value)
+void CLScale::configure(ICLTensor *input, ICLTensor *output, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value)
 {
-    ARM_COMPUTE_ERROR_ON(output == input);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-
     auto k = arm_compute::support::cpp14::make_unique<CLScaleKernel>();
     k->configure(input, output, policy, border_mode == BorderMode::UNDEFINED);
     _kernel = std::move(k);
diff --git a/src/runtime/NEON/functions/NEScale.cpp b/src/runtime/NEON/functions/NEScale.cpp
index 0b17e80ee7..bbd3fac63f 100644
--- a/src/runtime/NEON/functions/NEScale.cpp
+++ b/src/runtime/NEON/functions/NEScale.cpp
@@ -95,7 +95,7 @@ NEScale::NEScale() // NOLINT
 {
 }
 
-void NEScale::configure(ITensor *input, ITensor *output, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value)
+void NEScale::configure(ITensor *input, ITensor *output, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value)
 {
     ARM_COMPUTE_ERROR_ON(nullptr == input);
     ARM_COMPUTE_ERROR_ON(nullptr == output);
author	Georgios Pinitas <georgios.pinitas@arm.com>	2017-08-31 18:12:42 +0100
committer	Anthony Barbier <anthony.barbier@arm.com>	2018-11-02 16:35:24 +0000
commit	583137cc60580023abfd9d05abf933e7e117e29f (patch)
tree	b29ec55c11b65e2882e60c0cf8b592bf25e78b1b /src
parent	3021edfb5e72ef4cd91dbc754ce6ac55388ebc4e (diff)
download	ComputeLibrary-583137cc60580023abfd9d05abf933e7e117e29f.tar.gz