aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorGeorgios Pinitas <georgios.pinitas@arm.com>2017-08-31 18:12:42 +0100
committerAnthony Barbier <anthony.barbier@arm.com>2018-11-02 16:35:24 +0000
commit583137cc60580023abfd9d05abf933e7e117e29f (patch)
treeb29ec55c11b65e2882e60c0cf8b592bf25e78b1b /src
parent3021edfb5e72ef4cd91dbc754ce6ac55388ebc4e (diff)
downloadComputeLibrary-583137cc60580023abfd9d05abf933e7e117e29f.tar.gz
COMPMID-417: Add support for floats in scale.
Change-Id: I7d714ba13861509080a89817f54e9d32da83e970 Reviewed-on: http://mpd-gerrit.cambridge.arm.com/86026 Reviewed-by: Pablo Tello <pablo.tello@arm.com> Tested-by: Kaizen <jeremy.johnson+kaizengerrit@arm.com>
Diffstat (limited to 'src')
-rw-r--r--src/core/CL/kernels/CLFillBorderKernel.cpp1
-rw-r--r--src/core/CL/kernels/CLScaleKernel.cpp5
-rw-r--r--src/core/NEON/kernels/NEFillBorderKernel.cpp21
-rw-r--r--src/core/NEON/kernels/NERemapKernel.cpp32
-rw-r--r--src/core/NEON/kernels/NEScaleKernel.cpp209
-rw-r--r--src/core/NEON/kernels/NEWarpKernel.cpp12
-rw-r--r--src/runtime/CL/functions/CLScale.cpp7
-rw-r--r--src/runtime/NEON/functions/NEScale.cpp2
8 files changed, 214 insertions, 75 deletions
diff --git a/src/core/CL/kernels/CLFillBorderKernel.cpp b/src/core/CL/kernels/CLFillBorderKernel.cpp
index d2610539d1..2e066c7753 100644
--- a/src/core/CL/kernels/CLFillBorderKernel.cpp
+++ b/src/core/CL/kernels/CLFillBorderKernel.cpp
@@ -29,6 +29,7 @@
#include "arm_compute/core/CL/OpenCL.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
diff --git a/src/core/CL/kernels/CLScaleKernel.cpp b/src/core/CL/kernels/CLScaleKernel.cpp
index 23ce89aba2..66afc3db60 100644
--- a/src/core/CL/kernels/CLScaleKernel.cpp
+++ b/src/core/CL/kernels/CLScaleKernel.cpp
@@ -46,9 +46,10 @@ BorderSize CLScaleKernel::border_size() const
void CLScaleKernel::configure(const ICLTensor *input, ICLTensor *output, InterpolationPolicy policy, bool border_undefined)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(output);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_ERROR_ON(output == input);
_input = input;
_output = output;
diff --git a/src/core/NEON/kernels/NEFillBorderKernel.cpp b/src/core/NEON/kernels/NEFillBorderKernel.cpp
index 3f1f678a7e..9505a2520f 100644
--- a/src/core/NEON/kernels/NEFillBorderKernel.cpp
+++ b/src/core/NEON/kernels/NEFillBorderKernel.cpp
@@ -27,16 +27,13 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
#include <algorithm>
#include <cstdint>
-#if ARM_COMPUTE_ENABLE_FP16
-#include <arm_fp16.h> // needed for float16_t
-#endif /* ARM_COMPUTE_ENABLE_FP16 */
-
using namespace arm_compute;
namespace
@@ -163,18 +160,20 @@ void NEFillBorderKernel::run(const Window &window, const ThreadInfo &info)
case DataType::S32:
fill_constant_value_single_channel<int32_t>(window);
break;
-#ifdef ARM_COMPUTE_ENABLE_FP16
case DataType::F16:
- static_assert(sizeof(float16_t) == 2, "Float16_t must be 16 bit");
- fill_constant_value_single_channel<float16_t>(window);
+ static_assert(sizeof(half) == 2, "Float16_t must be 16 bit");
+ fill_constant_value_single_channel<half>(window);
break;
-#endif /* ARM_COMPUTE_ENABLE_FP16 */
case DataType::F32:
static_assert(sizeof(float) == 4, "Float must be 32 bit");
if(_border_size.left == 1 && _border_size.top == 1)
+ {
fill_constant_value_single_channel_special<float, 1u, 1u>(_tensor, window, _border_size.right, _border_size.bottom, _constant_border_value);
+ }
else
+ {
fill_constant_value_single_channel<float>(window);
+ }
break;
default:
ARM_COMPUTE_ERROR("Not handled");
@@ -205,12 +204,10 @@ void NEFillBorderKernel::run(const Window &window, const ThreadInfo &info)
case DataType::S32:
fill_replicate_single_channel<int32_t>(window);
break;
-#ifdef ARM_COMPUTE_ENABLE_FP16
case DataType::F16:
- static_assert(sizeof(float16_t) == 2, "Float16_t must be 16 bit");
- fill_replicate_single_channel<float16_t>(window);
+ static_assert(sizeof(half) == 2, "Float16_t must be 16 bit");
+ fill_replicate_single_channel<half>(window);
break;
-#endif /* ARM_COMPUTE_ENABLE_FP16 */
case DataType::F32:
static_assert(sizeof(float) == 4, "Float must be 32 bit");
fill_replicate_single_channel<float>(window);
diff --git a/src/core/NEON/kernels/NERemapKernel.cpp b/src/core/NEON/kernels/NERemapKernel.cpp
index 2dbabca2e3..83004aedc1 100644
--- a/src/core/NEON/kernels/NERemapKernel.cpp
+++ b/src/core/NEON/kernels/NERemapKernel.cpp
@@ -192,24 +192,24 @@ void NERemapKernel::remap_bilinear(const Window &window)
const uint8_t *in_ptr = in.ptr();
uint8x8_t tmp0 = vdup_n_u8(0);
- tmp0 = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[0], mapy_ptr[0]), tmp0, 0);
- tmp0 = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[1], mapy_ptr[1]), tmp0, 1);
- tmp0 = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[2], mapy_ptr[2]), tmp0, 2);
- tmp0 = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[3], mapy_ptr[3]), tmp0, 3);
- tmp0 = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[4], mapy_ptr[4]), tmp0, 4);
- tmp0 = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[5], mapy_ptr[5]), tmp0, 5);
- tmp0 = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[6], mapy_ptr[6]), tmp0, 6);
- tmp0 = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[7], mapy_ptr[7]), tmp0, 7);
+ tmp0 = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[0], mapy_ptr[0]), tmp0, 0);
+ tmp0 = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[1], mapy_ptr[1]), tmp0, 1);
+ tmp0 = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[2], mapy_ptr[2]), tmp0, 2);
+ tmp0 = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[3], mapy_ptr[3]), tmp0, 3);
+ tmp0 = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[4], mapy_ptr[4]), tmp0, 4);
+ tmp0 = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[5], mapy_ptr[5]), tmp0, 5);
+ tmp0 = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[6], mapy_ptr[6]), tmp0, 6);
+ tmp0 = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[7], mapy_ptr[7]), tmp0, 7);
uint8x8_t tmp1 = vdup_n_u8(0);
- tmp1 = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[8], mapy_ptr[8]), tmp1, 0);
- tmp1 = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[9], mapy_ptr[9]), tmp1, 1);
- tmp1 = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[10], mapy_ptr[10]), tmp1, 2);
- tmp1 = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[11], mapy_ptr[11]), tmp1, 3);
- tmp1 = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[12], mapy_ptr[12]), tmp1, 4);
- tmp1 = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[13], mapy_ptr[13]), tmp1, 5);
- tmp1 = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[14], mapy_ptr[14]), tmp1, 6);
- tmp1 = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[15], mapy_ptr[15]), tmp1, 7);
+ tmp1 = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[8], mapy_ptr[8]), tmp1, 0);
+ tmp1 = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[9], mapy_ptr[9]), tmp1, 1);
+ tmp1 = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[10], mapy_ptr[10]), tmp1, 2);
+ tmp1 = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[11], mapy_ptr[11]), tmp1, 3);
+ tmp1 = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[12], mapy_ptr[12]), tmp1, 4);
+ tmp1 = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[13], mapy_ptr[13]), tmp1, 5);
+ tmp1 = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[14], mapy_ptr[14]), tmp1, 6);
+ tmp1 = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[15], mapy_ptr[15]), tmp1, 7);
vst1q_u8(out.ptr(), vcombine_u8(tmp0, tmp1));
},
diff --git a/src/core/NEON/kernels/NEScaleKernel.cpp b/src/core/NEON/kernels/NEScaleKernel.cpp
index 7ec4212227..6634d4b13c 100644
--- a/src/core/NEON/kernels/NEScaleKernel.cpp
+++ b/src/core/NEON/kernels/NEScaleKernel.cpp
@@ -50,8 +50,10 @@ BorderSize NEScaleKernel::border_size() const
void NEScaleKernel::configure(const ITensor *input, const ITensor *dx, const ITensor *dy, const ITensor *offsets, ITensor *output, InterpolationPolicy policy, bool border_undefined)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_ERROR_ON(output == input);
if(policy == InterpolationPolicy::NEAREST_NEIGHBOR)
{
@@ -243,6 +245,50 @@ void NEScaleKernel::scale_nearest(const Window &window)
in, offsets, out);
break;
}
+ case DataType::F32:
+ {
+ float32x4x4_t tmp =
+ {
+ {
+ vdupq_n_f32(0),
+ vdupq_n_f32(0),
+ vdupq_n_f32(0),
+ vdupq_n_f32(0)
+ }
+ };
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const auto offsets_ptr = reinterpret_cast<const int32_t *>(offsets.ptr());
+
+ const int in_yi = (id.y() + 0.5f) * hr;
+ const int offset_row = in_yi * input_stride;
+
+ tmp.val[0] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[0] + offset_row), tmp.val[0], 0);
+ tmp.val[0] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[4] + offset_row), tmp.val[0], 1);
+ tmp.val[0] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[8] + offset_row), tmp.val[0], 2);
+ tmp.val[0] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[12] + offset_row), tmp.val[0], 3);
+
+ tmp.val[1] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[1] + offset_row), tmp.val[1], 0);
+ tmp.val[1] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[5] + offset_row), tmp.val[1], 1);
+ tmp.val[1] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[9] + offset_row), tmp.val[1], 2);
+ tmp.val[1] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[13] + offset_row), tmp.val[1], 3);
+
+ tmp.val[2] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[2] + offset_row), tmp.val[2], 0);
+ tmp.val[2] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[6] + offset_row), tmp.val[2], 1);
+ tmp.val[2] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[10] + offset_row), tmp.val[2], 2);
+ tmp.val[2] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[14] + offset_row), tmp.val[2], 3);
+
+ tmp.val[3] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[3] + offset_row), tmp.val[3], 0);
+ tmp.val[3] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[7] + offset_row), tmp.val[3], 1);
+ tmp.val[3] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[11] + offset_row), tmp.val[3], 2);
+ tmp.val[3] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[15] + offset_row), tmp.val[3], 3);
+
+ vst4q_f32(reinterpret_cast<float *>(out.ptr()), tmp);
+ },
+ in, offsets, out);
+ break;
+ }
default:
ARM_COMPUTE_ERROR("Not supported");
break;
@@ -251,7 +297,7 @@ void NEScaleKernel::scale_nearest(const Window &window)
void NEScaleKernel::scale_bilinear(const Window &window)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(_input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(_input, 1, DataType::U8, DataType::S16, DataType::F32);
// Compute the ratio between source height and destination height
const auto hr = static_cast<float>(_input->info()->dimension(1)) / static_cast<float>(_output->info()->dimension(1));
@@ -278,41 +324,140 @@ void NEScaleKernel::scale_bilinear(const Window &window)
Iterator dy(_dy, win_off);
/* Input image stride */
- const size_t in_stride = _input->info()->strides_in_bytes()[1];
+ const size_t in_stide_in_bytes = _input->info()->strides_in_bytes()[1];
+ const size_t in_stride = in_stide_in_bytes / _input->info()->element_size();
- execute_window_loop(window, [&](const Coordinates & id)
+ switch(_input->info()->data_type())
{
- const auto offsets_ptr = reinterpret_cast<const int32_t *>(offsets.ptr());
- const auto dx_ptr = reinterpret_cast<const float *>(dx.ptr());
- const auto dy_ptr = reinterpret_cast<const float *>(dy.ptr());
- const auto in_ptr = reinterpret_cast<const uint8_t *>(in.ptr());
+ case DataType::U8:
+ {
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const auto offsets_ptr = reinterpret_cast<const int32_t *>(offsets.ptr());
+ const auto dx_ptr = reinterpret_cast<const float *>(dx.ptr());
+ const auto dy_ptr = reinterpret_cast<const float *>(dy.ptr());
+ const auto in_ptr = reinterpret_cast<const uint8_t *>(in.ptr());
+
+ const int in_yi = std::floor((id.y() + 0.5f) * hr - 0.5f);
+ const int offset_row = in_yi * in_stide_in_bytes;
+
+ uint8x8_t tmp0 = vdup_n_u8(0);
+ tmp0 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[0] + offset_row], in_stride, dx_ptr[0], dy_ptr[0]), tmp0, 0);
+ tmp0 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[1] + offset_row], in_stride, dx_ptr[1], dy_ptr[1]), tmp0, 1);
+ tmp0 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[2] + offset_row], in_stride, dx_ptr[2], dy_ptr[2]), tmp0, 2);
+ tmp0 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[3] + offset_row], in_stride, dx_ptr[3], dy_ptr[3]), tmp0, 3);
+ tmp0 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[4] + offset_row], in_stride, dx_ptr[4], dy_ptr[4]), tmp0, 4);
+ tmp0 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[5] + offset_row], in_stride, dx_ptr[5], dy_ptr[5]), tmp0, 5);
+ tmp0 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[6] + offset_row], in_stride, dx_ptr[6], dy_ptr[6]), tmp0, 6);
+ tmp0 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[7] + offset_row], in_stride, dx_ptr[7], dy_ptr[7]), tmp0, 7);
+
+ uint8x8_t tmp1 = vdup_n_u8(0);
+ tmp1 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[8] + offset_row], in_stride, dx_ptr[8], dy_ptr[8]), tmp1, 0);
+ tmp1 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[9] + offset_row], in_stride, dx_ptr[9], dy_ptr[9]), tmp1, 1);
+ tmp1 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[10] + offset_row], in_stride, dx_ptr[10], dy_ptr[10]), tmp1, 2);
+ tmp1 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[11] + offset_row], in_stride, dx_ptr[11], dy_ptr[11]), tmp1, 3);
+ tmp1 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[12] + offset_row], in_stride, dx_ptr[12], dy_ptr[12]), tmp1, 4);
+ tmp1 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[13] + offset_row], in_stride, dx_ptr[13], dy_ptr[13]), tmp1, 5);
+ tmp1 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[14] + offset_row], in_stride, dx_ptr[14], dy_ptr[14]), tmp1, 6);
+ tmp1 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[15] + offset_row], in_stride, dx_ptr[15], dy_ptr[15]), tmp1, 7);
+
+ vst1q_u8(out.ptr(), vcombine_u8(tmp0, tmp1));
+ },
+ in, offsets, dx, dy, out);
+ break;
+ }
+ case DataType::S16:
+ {
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const auto offsets_ptr = reinterpret_cast<const int32_t *>(offsets.ptr());
+ const auto dx_ptr = reinterpret_cast<const float *>(dx.ptr());
+ const auto dy_ptr = reinterpret_cast<const float *>(dy.ptr());
- const int in_yi = std::floor((id.y() + 0.5f) * hr - 0.5f);
- const int offset_row = in_yi * in_stride;
+ const int in_yi = std::floor((id.y() + 0.5f) * hr - 0.5f);
+ const int offset_row = in_yi * in_stide_in_bytes;
- uint8x8_t tmp0 = vdup_n_u8(0);
- tmp0 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[0] + offset_row], in_stride, dx_ptr[0], dy_ptr[0]), tmp0, 0);
- tmp0 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[1] + offset_row], in_stride, dx_ptr[1], dy_ptr[1]), tmp0, 1);
- tmp0 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[2] + offset_row], in_stride, dx_ptr[2], dy_ptr[2]), tmp0, 2);
- tmp0 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[3] + offset_row], in_stride, dx_ptr[3], dy_ptr[3]), tmp0, 3);
- tmp0 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[4] + offset_row], in_stride, dx_ptr[4], dy_ptr[4]), tmp0, 4);
- tmp0 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[5] + offset_row], in_stride, dx_ptr[5], dy_ptr[5]), tmp0, 5);
- tmp0 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[6] + offset_row], in_stride, dx_ptr[6], dy_ptr[6]), tmp0, 6);
- tmp0 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[7] + offset_row], in_stride, dx_ptr[7], dy_ptr[7]), tmp0, 7);
+ int16x8x2_t tmp =
+ {
+ {
+ vdupq_n_s16(0),
+ vdupq_n_s16(0)
+ }
+ };
+
+ tmp.val[0] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[0] + offset_row), in_stride, dx_ptr[0], dy_ptr[0]), tmp.val[0], 0);
+ tmp.val[0] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[2] + offset_row), in_stride, dx_ptr[2], dy_ptr[2]), tmp.val[0], 1);
+ tmp.val[0] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[4] + offset_row), in_stride, dx_ptr[4], dy_ptr[4]), tmp.val[0], 2);
+ tmp.val[0] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[6] + offset_row), in_stride, dx_ptr[6], dy_ptr[6]), tmp.val[0], 3);
+ tmp.val[0] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[8] + offset_row), in_stride, dx_ptr[8], dy_ptr[8]), tmp.val[0], 4);
+ tmp.val[0] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[10] + offset_row), in_stride, dx_ptr[10], dy_ptr[10]), tmp.val[0], 5);
+ tmp.val[0] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[12] + offset_row), in_stride, dx_ptr[12], dy_ptr[12]), tmp.val[0], 6);
+ tmp.val[0] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[14] + offset_row), in_stride, dx_ptr[14], dy_ptr[14]), tmp.val[0], 7);
+
+ tmp.val[1] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[1] + offset_row), in_stride, dx_ptr[1], dy_ptr[1]), tmp.val[1], 0);
+ tmp.val[1] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[3] + offset_row), in_stride, dx_ptr[3], dy_ptr[3]), tmp.val[1], 1);
+ tmp.val[1] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[5] + offset_row), in_stride, dx_ptr[5], dy_ptr[5]), tmp.val[1], 2);
+ tmp.val[1] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[7] + offset_row), in_stride, dx_ptr[7], dy_ptr[7]), tmp.val[1], 3);
+ tmp.val[1] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[9] + offset_row), in_stride, dx_ptr[9], dy_ptr[9]), tmp.val[1], 4);
+ tmp.val[1] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[11] + offset_row), in_stride, dx_ptr[11], dy_ptr[11]), tmp.val[1], 5);
+ tmp.val[1] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[13] + offset_row), in_stride, dx_ptr[13], dy_ptr[13]), tmp.val[1], 6);
+ tmp.val[1] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[15] + offset_row), in_stride, dx_ptr[15], dy_ptr[15]), tmp.val[1], 7);
- uint8x8_t tmp1 = vdup_n_u8(0);
- tmp1 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[8] + offset_row], in_stride, dx_ptr[8], dy_ptr[8]), tmp1, 0);
- tmp1 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[9] + offset_row], in_stride, dx_ptr[9], dy_ptr[9]), tmp1, 1);
- tmp1 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[10] + offset_row], in_stride, dx_ptr[10], dy_ptr[10]), tmp1, 2);
- tmp1 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[11] + offset_row], in_stride, dx_ptr[11], dy_ptr[11]), tmp1, 3);
- tmp1 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[12] + offset_row], in_stride, dx_ptr[12], dy_ptr[12]), tmp1, 4);
- tmp1 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[13] + offset_row], in_stride, dx_ptr[13], dy_ptr[13]), tmp1, 5);
- tmp1 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[14] + offset_row], in_stride, dx_ptr[14], dy_ptr[14]), tmp1, 6);
- tmp1 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[15] + offset_row], in_stride, dx_ptr[15], dy_ptr[15]), tmp1, 7);
+ vst2q_s16(reinterpret_cast<int16_t *>(out.ptr()), tmp);
+ },
+ in, offsets, dx, dy, out);
+ break;
+ }
+ case DataType::F32:
+ {
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const auto offsets_ptr = reinterpret_cast<const int32_t *>(offsets.ptr());
+ const auto dx_ptr = reinterpret_cast<const float *>(dx.ptr());
+ const auto dy_ptr = reinterpret_cast<const float *>(dy.ptr());
- vst1q_u8(out.ptr(), vcombine_u8(tmp0, tmp1));
- },
- in, offsets, dx, dy, out);
+ const int in_yi = std::floor((id.y() + 0.5f) * hr - 0.5f);
+ const int offset_row = in_yi * in_stide_in_bytes;
+
+ float32x4x4_t tmp =
+ {
+ {
+ vdupq_n_f32(0),
+ vdupq_n_f32(0),
+ vdupq_n_f32(0),
+ vdupq_n_f32(0)
+ }
+ };
+
+ tmp.val[0] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast<const float *>(in.ptr() + offsets_ptr[0] + offset_row), in_stride, dx_ptr[0], dy_ptr[0]), tmp.val[0], 0);
+ tmp.val[0] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast<const float *>(in.ptr() + offsets_ptr[4] + offset_row), in_stride, dx_ptr[4], dy_ptr[4]), tmp.val[0], 1);
+ tmp.val[0] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast<const float *>(in.ptr() + offsets_ptr[8] + offset_row), in_stride, dx_ptr[8], dy_ptr[8]), tmp.val[0], 2);
+ tmp.val[0] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast<const float *>(in.ptr() + offsets_ptr[12] + offset_row), in_stride, dx_ptr[12], dy_ptr[12]), tmp.val[0], 3);
+
+ tmp.val[1] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast<const float *>(in.ptr() + offsets_ptr[1] + offset_row), in_stride, dx_ptr[1], dy_ptr[1]), tmp.val[1], 0);
+ tmp.val[1] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast<const float *>(in.ptr() + offsets_ptr[5] + offset_row), in_stride, dx_ptr[5], dy_ptr[5]), tmp.val[1], 1);
+ tmp.val[1] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast<const float *>(in.ptr() + offsets_ptr[9] + offset_row), in_stride, dx_ptr[9], dy_ptr[9]), tmp.val[1], 2);
+ tmp.val[1] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast<const float *>(in.ptr() + offsets_ptr[13] + offset_row), in_stride, dx_ptr[13], dy_ptr[13]), tmp.val[1], 3);
+
+ tmp.val[2] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast<const float *>(in.ptr() + offsets_ptr[2] + offset_row), in_stride, dx_ptr[2], dy_ptr[2]), tmp.val[2], 0);
+ tmp.val[2] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast<const float *>(in.ptr() + offsets_ptr[6] + offset_row), in_stride, dx_ptr[6], dy_ptr[6]), tmp.val[2], 1);
+ tmp.val[2] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast<const float *>(in.ptr() + offsets_ptr[10] + offset_row), in_stride, dx_ptr[10], dy_ptr[10]), tmp.val[2], 2);
+ tmp.val[2] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast<const float *>(in.ptr() + offsets_ptr[14] + offset_row), in_stride, dx_ptr[14], dy_ptr[14]), tmp.val[2], 3);
+
+ tmp.val[3] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast<const float *>(in.ptr() + offsets_ptr[3] + offset_row), in_stride, dx_ptr[3], dy_ptr[3]), tmp.val[3], 0);
+ tmp.val[3] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast<const float *>(in.ptr() + offsets_ptr[7] + offset_row), in_stride, dx_ptr[7], dy_ptr[7]), tmp.val[3], 1);
+ tmp.val[3] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast<const float *>(in.ptr() + offsets_ptr[11] + offset_row), in_stride, dx_ptr[11], dy_ptr[11]), tmp.val[3], 2);
+ tmp.val[3] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast<const float *>(in.ptr() + offsets_ptr[15] + offset_row), in_stride, dx_ptr[15], dy_ptr[15]), tmp.val[3], 3);
+
+ vst4q_f32(reinterpret_cast<float *>(out.ptr()), tmp);
+ },
+ in, offsets, dx, dy, out);
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ break;
+ }
}
void NEScaleKernel::scale_area(const Window &window)
diff --git a/src/core/NEON/kernels/NEWarpKernel.cpp b/src/core/NEON/kernels/NEWarpKernel.cpp
index b13e99e800..62f4e5d057 100644
--- a/src/core/NEON/kernels/NEWarpKernel.cpp
+++ b/src/core/NEON/kernels/NEWarpKernel.cpp
@@ -177,7 +177,7 @@ void NEWarpAffineKernel<interpolation>::warp_undefined(const Window &window)
*out.ptr() = nearest_interpolation(in.ptr(), x0, y0, stride);
break;
case InterpolationPolicy::BILINEAR:
- *out.ptr() = pixel_bilinear_c1u8(in.ptr(), stride, x0, y0);
+ *out.ptr() = pixel_bilinear_c1(in.ptr(), stride, x0, y0);
break;
default:
ARM_COMPUTE_ERROR("Interpolation not supported");
@@ -256,7 +256,7 @@ void NEWarpAffineKernel<interpolation>::warp_constant(const Window &window)
*out.ptr() = nearest_interpolation(in.ptr(), x0, y0, stride);
break;
case InterpolationPolicy::BILINEAR:
- *out.ptr() = pixel_bilinear_c1u8(in.ptr(), stride, x0, y0);
+ *out.ptr() = pixel_bilinear_c1(in.ptr(), stride, x0, y0);
break;
default:
ARM_COMPUTE_ERROR("Interpolation not supported");
@@ -336,7 +336,7 @@ void NEWarpAffineKernel<interpolation>::warp_replicate(const Window &window)
*out.ptr() = nearest_interpolation(in.ptr(), x0, y0, stride);
break;
case InterpolationPolicy::BILINEAR:
- *out.ptr() = pixel_bilinear_c1u8(in.ptr(), stride, x0, y0);
+ *out.ptr() = pixel_bilinear_c1(in.ptr(), stride, x0, y0);
break;
default:
ARM_COMPUTE_ERROR("Interpolation not supported");
@@ -445,7 +445,7 @@ void NEWarpPerspectiveKernel<interpolation>::warp_undefined(const Window &window
*out.ptr() = nearest_interpolation(in.ptr(), xn, yn, stride);
break;
case InterpolationPolicy::BILINEAR:
- *out.ptr() = pixel_bilinear_c1u8(in.ptr(), stride, xn, yn);
+ *out.ptr() = pixel_bilinear_c1(in.ptr(), stride, xn, yn);
break;
default:
ARM_COMPUTE_ERROR("Interpolation not supported");
@@ -546,7 +546,7 @@ void NEWarpPerspectiveKernel<interpolation>::warp_constant(const Window &window)
*out.ptr() = nearest_interpolation(in.ptr(), xn, yn, stride);
break;
case InterpolationPolicy::BILINEAR:
- *out.ptr() = pixel_bilinear_c1u8(in.ptr(), stride, xn, yn);
+ *out.ptr() = pixel_bilinear_c1(in.ptr(), stride, xn, yn);
break;
default:
ARM_COMPUTE_ERROR("Interpolation not supported");
@@ -678,7 +678,7 @@ void NEWarpPerspectiveKernel<interpolation>::warp_replicate(const Window &window
*out.ptr() = nearest_interpolation(in.ptr(), xn, yn, stride);
break;
case InterpolationPolicy::BILINEAR:
- *out.ptr() = pixel_bilinear_c1u8(in.ptr(), stride, xn, yn);
+ *out.ptr() = pixel_bilinear_c1(in.ptr(), stride, xn, yn);
break;
default:
ARM_COMPUTE_ERROR("Interpolation not supported");
diff --git a/src/runtime/CL/functions/CLScale.cpp b/src/runtime/CL/functions/CLScale.cpp
index 05522b4af5..49b0275019 100644
--- a/src/runtime/CL/functions/CLScale.cpp
+++ b/src/runtime/CL/functions/CLScale.cpp
@@ -31,13 +31,8 @@
using namespace arm_compute;
-void CLScale::configure(ICLTensor *input, ICLTensor *output, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value)
+void CLScale::configure(ICLTensor *input, ICLTensor *output, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value)
{
- ARM_COMPUTE_ERROR_ON(output == input);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-
auto k = arm_compute::support::cpp14::make_unique<CLScaleKernel>();
k->configure(input, output, policy, border_mode == BorderMode::UNDEFINED);
_kernel = std::move(k);
diff --git a/src/runtime/NEON/functions/NEScale.cpp b/src/runtime/NEON/functions/NEScale.cpp
index 0b17e80ee7..bbd3fac63f 100644
--- a/src/runtime/NEON/functions/NEScale.cpp
+++ b/src/runtime/NEON/functions/NEScale.cpp
@@ -95,7 +95,7 @@ NEScale::NEScale() // NOLINT
{
}
-void NEScale::configure(ITensor *input, ITensor *output, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value)
+void NEScale::configure(ITensor *input, ITensor *output, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value)
{
ARM_COMPUTE_ERROR_ON(nullptr == input);
ARM_COMPUTE_ERROR_ON(nullptr == output);