From 583137cc60580023abfd9d05abf933e7e117e29f Mon Sep 17 00:00:00 2001 From: Georgios Pinitas Date: Thu, 31 Aug 2017 18:12:42 +0100 Subject: COMPMID-417: Add support for floats in scale. Change-Id: I7d714ba13861509080a89817f54e9d32da83e970 Reviewed-on: http://mpd-gerrit.cambridge.arm.com/86026 Reviewed-by: Pablo Tello Tested-by: Kaizen --- src/core/CL/kernels/CLFillBorderKernel.cpp | 1 + src/core/CL/kernels/CLScaleKernel.cpp | 5 +- src/core/NEON/kernels/NEFillBorderKernel.cpp | 21 ++- src/core/NEON/kernels/NERemapKernel.cpp | 32 ++-- src/core/NEON/kernels/NEScaleKernel.cpp | 209 +++++++++++++++++++++++---- src/core/NEON/kernels/NEWarpKernel.cpp | 12 +- src/runtime/CL/functions/CLScale.cpp | 7 +- src/runtime/NEON/functions/NEScale.cpp | 2 +- 8 files changed, 214 insertions(+), 75 deletions(-) (limited to 'src') diff --git a/src/core/CL/kernels/CLFillBorderKernel.cpp b/src/core/CL/kernels/CLFillBorderKernel.cpp index d2610539d1..2e066c7753 100644 --- a/src/core/CL/kernels/CLFillBorderKernel.cpp +++ b/src/core/CL/kernels/CLFillBorderKernel.cpp @@ -29,6 +29,7 @@ #include "arm_compute/core/CL/OpenCL.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" diff --git a/src/core/CL/kernels/CLScaleKernel.cpp b/src/core/CL/kernels/CLScaleKernel.cpp index 23ce89aba2..66afc3db60 100644 --- a/src/core/CL/kernels/CLScaleKernel.cpp +++ b/src/core/CL/kernels/CLScaleKernel.cpp @@ -46,9 +46,10 @@ BorderSize CLScaleKernel::border_size() const void CLScaleKernel::configure(const ICLTensor *input, ICLTensor *output, InterpolationPolicy policy, bool border_undefined) { - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_NULLPTR(output); ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + ARM_COMPUTE_ERROR_ON(output == input); _input = input; _output = output; diff --git a/src/core/NEON/kernels/NEFillBorderKernel.cpp b/src/core/NEON/kernels/NEFillBorderKernel.cpp index 3f1f678a7e..9505a2520f 100644 --- a/src/core/NEON/kernels/NEFillBorderKernel.cpp +++ b/src/core/NEON/kernels/NEFillBorderKernel.cpp @@ -27,16 +27,13 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" #include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" #include #include -#if ARM_COMPUTE_ENABLE_FP16 -#include // needed for float16_t -#endif /* ARM_COMPUTE_ENABLE_FP16 */ - using namespace arm_compute; namespace @@ -163,18 +160,20 @@ void NEFillBorderKernel::run(const Window &window, const ThreadInfo &info) case DataType::S32: fill_constant_value_single_channel(window); break; -#ifdef ARM_COMPUTE_ENABLE_FP16 case DataType::F16: - static_assert(sizeof(float16_t) == 2, "Float16_t must be 16 bit"); - fill_constant_value_single_channel(window); + static_assert(sizeof(half) == 2, "Float16_t must be 16 bit"); + fill_constant_value_single_channel(window); break; -#endif /* ARM_COMPUTE_ENABLE_FP16 */ case DataType::F32: static_assert(sizeof(float) == 4, "Float must be 32 bit"); if(_border_size.left == 1 && _border_size.top == 1) + { fill_constant_value_single_channel_special(_tensor, window, _border_size.right, _border_size.bottom, _constant_border_value); + } else + { fill_constant_value_single_channel(window); + } break; default: ARM_COMPUTE_ERROR("Not handled"); @@ -205,12 +204,10 @@ void NEFillBorderKernel::run(const Window &window, const ThreadInfo &info) case DataType::S32: fill_replicate_single_channel(window); break; -#ifdef ARM_COMPUTE_ENABLE_FP16 case DataType::F16: - static_assert(sizeof(float16_t) == 2, "Float16_t must be 16 bit"); - fill_replicate_single_channel(window); + static_assert(sizeof(half) == 2, "Float16_t must be 16 bit"); + fill_replicate_single_channel(window); break; -#endif /* ARM_COMPUTE_ENABLE_FP16 */ case DataType::F32: static_assert(sizeof(float) == 4, "Float must be 32 bit"); fill_replicate_single_channel(window); diff --git a/src/core/NEON/kernels/NERemapKernel.cpp b/src/core/NEON/kernels/NERemapKernel.cpp index 2dbabca2e3..83004aedc1 100644 --- a/src/core/NEON/kernels/NERemapKernel.cpp +++ b/src/core/NEON/kernels/NERemapKernel.cpp @@ -192,24 +192,24 @@ void NERemapKernel::remap_bilinear(const Window &window) const uint8_t *in_ptr = in.ptr(); uint8x8_t tmp0 = vdup_n_u8(0); - tmp0 = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[0], mapy_ptr[0]), tmp0, 0); - tmp0 = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[1], mapy_ptr[1]), tmp0, 1); - tmp0 = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[2], mapy_ptr[2]), tmp0, 2); - tmp0 = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[3], mapy_ptr[3]), tmp0, 3); - tmp0 = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[4], mapy_ptr[4]), tmp0, 4); - tmp0 = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[5], mapy_ptr[5]), tmp0, 5); - tmp0 = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[6], mapy_ptr[6]), tmp0, 6); - tmp0 = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[7], mapy_ptr[7]), tmp0, 7); + tmp0 = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[0], mapy_ptr[0]), tmp0, 0); + tmp0 = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[1], mapy_ptr[1]), tmp0, 1); + tmp0 = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[2], mapy_ptr[2]), tmp0, 2); + tmp0 = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[3], mapy_ptr[3]), tmp0, 3); + tmp0 = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[4], mapy_ptr[4]), tmp0, 4); + tmp0 = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[5], mapy_ptr[5]), tmp0, 5); + tmp0 = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[6], mapy_ptr[6]), tmp0, 6); + tmp0 = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[7], mapy_ptr[7]), tmp0, 7); uint8x8_t tmp1 = vdup_n_u8(0); - tmp1 = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[8], mapy_ptr[8]), tmp1, 0); - tmp1 = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[9], mapy_ptr[9]), tmp1, 1); - tmp1 = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[10], mapy_ptr[10]), tmp1, 2); - tmp1 = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[11], mapy_ptr[11]), tmp1, 3); - tmp1 = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[12], mapy_ptr[12]), tmp1, 4); - tmp1 = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[13], mapy_ptr[13]), tmp1, 5); - tmp1 = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[14], mapy_ptr[14]), tmp1, 6); - tmp1 = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[15], mapy_ptr[15]), tmp1, 7); + tmp1 = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[8], mapy_ptr[8]), tmp1, 0); + tmp1 = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[9], mapy_ptr[9]), tmp1, 1); + tmp1 = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[10], mapy_ptr[10]), tmp1, 2); + tmp1 = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[11], mapy_ptr[11]), tmp1, 3); + tmp1 = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[12], mapy_ptr[12]), tmp1, 4); + tmp1 = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[13], mapy_ptr[13]), tmp1, 5); + tmp1 = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[14], mapy_ptr[14]), tmp1, 6); + tmp1 = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[15], mapy_ptr[15]), tmp1, 7); vst1q_u8(out.ptr(), vcombine_u8(tmp0, tmp1)); }, diff --git a/src/core/NEON/kernels/NEScaleKernel.cpp b/src/core/NEON/kernels/NEScaleKernel.cpp index 7ec4212227..6634d4b13c 100644 --- a/src/core/NEON/kernels/NEScaleKernel.cpp +++ b/src/core/NEON/kernels/NEScaleKernel.cpp @@ -50,8 +50,10 @@ BorderSize NEScaleKernel::border_size() const void NEScaleKernel::configure(const ITensor *input, const ITensor *dx, const ITensor *dy, const ITensor *offsets, ITensor *output, InterpolationPolicy policy, bool border_undefined) { - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16, DataType::F32); + ARM_COMPUTE_ERROR_ON_NULLPTR(output); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + ARM_COMPUTE_ERROR_ON(output == input); if(policy == InterpolationPolicy::NEAREST_NEIGHBOR) { @@ -243,6 +245,50 @@ void NEScaleKernel::scale_nearest(const Window &window) in, offsets, out); break; } + case DataType::F32: + { + float32x4x4_t tmp = + { + { + vdupq_n_f32(0), + vdupq_n_f32(0), + vdupq_n_f32(0), + vdupq_n_f32(0) + } + }; + + execute_window_loop(window, [&](const Coordinates & id) + { + const auto offsets_ptr = reinterpret_cast(offsets.ptr()); + + const int in_yi = (id.y() + 0.5f) * hr; + const int offset_row = in_yi * input_stride; + + tmp.val[0] = vsetq_lane_f32(*reinterpret_cast(in.ptr() + offsets_ptr[0] + offset_row), tmp.val[0], 0); + tmp.val[0] = vsetq_lane_f32(*reinterpret_cast(in.ptr() + offsets_ptr[4] + offset_row), tmp.val[0], 1); + tmp.val[0] = vsetq_lane_f32(*reinterpret_cast(in.ptr() + offsets_ptr[8] + offset_row), tmp.val[0], 2); + tmp.val[0] = vsetq_lane_f32(*reinterpret_cast(in.ptr() + offsets_ptr[12] + offset_row), tmp.val[0], 3); + + tmp.val[1] = vsetq_lane_f32(*reinterpret_cast(in.ptr() + offsets_ptr[1] + offset_row), tmp.val[1], 0); + tmp.val[1] = vsetq_lane_f32(*reinterpret_cast(in.ptr() + offsets_ptr[5] + offset_row), tmp.val[1], 1); + tmp.val[1] = vsetq_lane_f32(*reinterpret_cast(in.ptr() + offsets_ptr[9] + offset_row), tmp.val[1], 2); + tmp.val[1] = vsetq_lane_f32(*reinterpret_cast(in.ptr() + offsets_ptr[13] + offset_row), tmp.val[1], 3); + + tmp.val[2] = vsetq_lane_f32(*reinterpret_cast(in.ptr() + offsets_ptr[2] + offset_row), tmp.val[2], 0); + tmp.val[2] = vsetq_lane_f32(*reinterpret_cast(in.ptr() + offsets_ptr[6] + offset_row), tmp.val[2], 1); + tmp.val[2] = vsetq_lane_f32(*reinterpret_cast(in.ptr() + offsets_ptr[10] + offset_row), tmp.val[2], 2); + tmp.val[2] = vsetq_lane_f32(*reinterpret_cast(in.ptr() + offsets_ptr[14] + offset_row), tmp.val[2], 3); + + tmp.val[3] = vsetq_lane_f32(*reinterpret_cast(in.ptr() + offsets_ptr[3] + offset_row), tmp.val[3], 0); + tmp.val[3] = vsetq_lane_f32(*reinterpret_cast(in.ptr() + offsets_ptr[7] + offset_row), tmp.val[3], 1); + tmp.val[3] = vsetq_lane_f32(*reinterpret_cast(in.ptr() + offsets_ptr[11] + offset_row), tmp.val[3], 2); + tmp.val[3] = vsetq_lane_f32(*reinterpret_cast(in.ptr() + offsets_ptr[15] + offset_row), tmp.val[3], 3); + + vst4q_f32(reinterpret_cast(out.ptr()), tmp); + }, + in, offsets, out); + break; + } default: ARM_COMPUTE_ERROR("Not supported"); break; @@ -251,7 +297,7 @@ void NEScaleKernel::scale_nearest(const Window &window) void NEScaleKernel::scale_bilinear(const Window &window) { - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(_input, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(_input, 1, DataType::U8, DataType::S16, DataType::F32); // Compute the ratio between source height and destination height const auto hr = static_cast(_input->info()->dimension(1)) / static_cast(_output->info()->dimension(1)); @@ -278,41 +324,140 @@ void NEScaleKernel::scale_bilinear(const Window &window) Iterator dy(_dy, win_off); /* Input image stride */ - const size_t in_stride = _input->info()->strides_in_bytes()[1]; + const size_t in_stide_in_bytes = _input->info()->strides_in_bytes()[1]; + const size_t in_stride = in_stide_in_bytes / _input->info()->element_size(); - execute_window_loop(window, [&](const Coordinates & id) + switch(_input->info()->data_type()) { - const auto offsets_ptr = reinterpret_cast(offsets.ptr()); - const auto dx_ptr = reinterpret_cast(dx.ptr()); - const auto dy_ptr = reinterpret_cast(dy.ptr()); - const auto in_ptr = reinterpret_cast(in.ptr()); + case DataType::U8: + { + execute_window_loop(window, [&](const Coordinates & id) + { + const auto offsets_ptr = reinterpret_cast(offsets.ptr()); + const auto dx_ptr = reinterpret_cast(dx.ptr()); + const auto dy_ptr = reinterpret_cast(dy.ptr()); + const auto in_ptr = reinterpret_cast(in.ptr()); + + const int in_yi = std::floor((id.y() + 0.5f) * hr - 0.5f); + const int offset_row = in_yi * in_stide_in_bytes; + + uint8x8_t tmp0 = vdup_n_u8(0); + tmp0 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[0] + offset_row], in_stride, dx_ptr[0], dy_ptr[0]), tmp0, 0); + tmp0 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[1] + offset_row], in_stride, dx_ptr[1], dy_ptr[1]), tmp0, 1); + tmp0 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[2] + offset_row], in_stride, dx_ptr[2], dy_ptr[2]), tmp0, 2); + tmp0 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[3] + offset_row], in_stride, dx_ptr[3], dy_ptr[3]), tmp0, 3); + tmp0 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[4] + offset_row], in_stride, dx_ptr[4], dy_ptr[4]), tmp0, 4); + tmp0 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[5] + offset_row], in_stride, dx_ptr[5], dy_ptr[5]), tmp0, 5); + tmp0 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[6] + offset_row], in_stride, dx_ptr[6], dy_ptr[6]), tmp0, 6); + tmp0 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[7] + offset_row], in_stride, dx_ptr[7], dy_ptr[7]), tmp0, 7); + + uint8x8_t tmp1 = vdup_n_u8(0); + tmp1 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[8] + offset_row], in_stride, dx_ptr[8], dy_ptr[8]), tmp1, 0); + tmp1 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[9] + offset_row], in_stride, dx_ptr[9], dy_ptr[9]), tmp1, 1); + tmp1 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[10] + offset_row], in_stride, dx_ptr[10], dy_ptr[10]), tmp1, 2); + tmp1 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[11] + offset_row], in_stride, dx_ptr[11], dy_ptr[11]), tmp1, 3); + tmp1 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[12] + offset_row], in_stride, dx_ptr[12], dy_ptr[12]), tmp1, 4); + tmp1 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[13] + offset_row], in_stride, dx_ptr[13], dy_ptr[13]), tmp1, 5); + tmp1 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[14] + offset_row], in_stride, dx_ptr[14], dy_ptr[14]), tmp1, 6); + tmp1 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[15] + offset_row], in_stride, dx_ptr[15], dy_ptr[15]), tmp1, 7); + + vst1q_u8(out.ptr(), vcombine_u8(tmp0, tmp1)); + }, + in, offsets, dx, dy, out); + break; + } + case DataType::S16: + { + execute_window_loop(window, [&](const Coordinates & id) + { + const auto offsets_ptr = reinterpret_cast(offsets.ptr()); + const auto dx_ptr = reinterpret_cast(dx.ptr()); + const auto dy_ptr = reinterpret_cast(dy.ptr()); - const int in_yi = std::floor((id.y() + 0.5f) * hr - 0.5f); - const int offset_row = in_yi * in_stride; + const int in_yi = std::floor((id.y() + 0.5f) * hr - 0.5f); + const int offset_row = in_yi * in_stide_in_bytes; - uint8x8_t tmp0 = vdup_n_u8(0); - tmp0 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[0] + offset_row], in_stride, dx_ptr[0], dy_ptr[0]), tmp0, 0); - tmp0 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[1] + offset_row], in_stride, dx_ptr[1], dy_ptr[1]), tmp0, 1); - tmp0 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[2] + offset_row], in_stride, dx_ptr[2], dy_ptr[2]), tmp0, 2); - tmp0 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[3] + offset_row], in_stride, dx_ptr[3], dy_ptr[3]), tmp0, 3); - tmp0 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[4] + offset_row], in_stride, dx_ptr[4], dy_ptr[4]), tmp0, 4); - tmp0 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[5] + offset_row], in_stride, dx_ptr[5], dy_ptr[5]), tmp0, 5); - tmp0 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[6] + offset_row], in_stride, dx_ptr[6], dy_ptr[6]), tmp0, 6); - tmp0 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[7] + offset_row], in_stride, dx_ptr[7], dy_ptr[7]), tmp0, 7); + int16x8x2_t tmp = + { + { + vdupq_n_s16(0), + vdupq_n_s16(0) + } + }; + + tmp.val[0] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast(in.ptr() + offsets_ptr[0] + offset_row), in_stride, dx_ptr[0], dy_ptr[0]), tmp.val[0], 0); + tmp.val[0] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast(in.ptr() + offsets_ptr[2] + offset_row), in_stride, dx_ptr[2], dy_ptr[2]), tmp.val[0], 1); + tmp.val[0] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast(in.ptr() + offsets_ptr[4] + offset_row), in_stride, dx_ptr[4], dy_ptr[4]), tmp.val[0], 2); + tmp.val[0] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast(in.ptr() + offsets_ptr[6] + offset_row), in_stride, dx_ptr[6], dy_ptr[6]), tmp.val[0], 3); + tmp.val[0] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast(in.ptr() + offsets_ptr[8] + offset_row), in_stride, dx_ptr[8], dy_ptr[8]), tmp.val[0], 4); + tmp.val[0] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast(in.ptr() + offsets_ptr[10] + offset_row), in_stride, dx_ptr[10], dy_ptr[10]), tmp.val[0], 5); + tmp.val[0] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast(in.ptr() + offsets_ptr[12] + offset_row), in_stride, dx_ptr[12], dy_ptr[12]), tmp.val[0], 6); + tmp.val[0] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast(in.ptr() + offsets_ptr[14] + offset_row), in_stride, dx_ptr[14], dy_ptr[14]), tmp.val[0], 7); + + tmp.val[1] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast(in.ptr() + offsets_ptr[1] + offset_row), in_stride, dx_ptr[1], dy_ptr[1]), tmp.val[1], 0); + tmp.val[1] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast(in.ptr() + offsets_ptr[3] + offset_row), in_stride, dx_ptr[3], dy_ptr[3]), tmp.val[1], 1); + tmp.val[1] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast(in.ptr() + offsets_ptr[5] + offset_row), in_stride, dx_ptr[5], dy_ptr[5]), tmp.val[1], 2); + tmp.val[1] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast(in.ptr() + offsets_ptr[7] + offset_row), in_stride, dx_ptr[7], dy_ptr[7]), tmp.val[1], 3); + tmp.val[1] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast(in.ptr() + offsets_ptr[9] + offset_row), in_stride, dx_ptr[9], dy_ptr[9]), tmp.val[1], 4); + tmp.val[1] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast(in.ptr() + offsets_ptr[11] + offset_row), in_stride, dx_ptr[11], dy_ptr[11]), tmp.val[1], 5); + tmp.val[1] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast(in.ptr() + offsets_ptr[13] + offset_row), in_stride, dx_ptr[13], dy_ptr[13]), tmp.val[1], 6); + tmp.val[1] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast(in.ptr() + offsets_ptr[15] + offset_row), in_stride, dx_ptr[15], dy_ptr[15]), tmp.val[1], 7); - uint8x8_t tmp1 = vdup_n_u8(0); - tmp1 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[8] + offset_row], in_stride, dx_ptr[8], dy_ptr[8]), tmp1, 0); - tmp1 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[9] + offset_row], in_stride, dx_ptr[9], dy_ptr[9]), tmp1, 1); - tmp1 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[10] + offset_row], in_stride, dx_ptr[10], dy_ptr[10]), tmp1, 2); - tmp1 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[11] + offset_row], in_stride, dx_ptr[11], dy_ptr[11]), tmp1, 3); - tmp1 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[12] + offset_row], in_stride, dx_ptr[12], dy_ptr[12]), tmp1, 4); - tmp1 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[13] + offset_row], in_stride, dx_ptr[13], dy_ptr[13]), tmp1, 5); - tmp1 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[14] + offset_row], in_stride, dx_ptr[14], dy_ptr[14]), tmp1, 6); - tmp1 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[15] + offset_row], in_stride, dx_ptr[15], dy_ptr[15]), tmp1, 7); + vst2q_s16(reinterpret_cast(out.ptr()), tmp); + }, + in, offsets, dx, dy, out); + break; + } + case DataType::F32: + { + execute_window_loop(window, [&](const Coordinates & id) + { + const auto offsets_ptr = reinterpret_cast(offsets.ptr()); + const auto dx_ptr = reinterpret_cast(dx.ptr()); + const auto dy_ptr = reinterpret_cast(dy.ptr()); - vst1q_u8(out.ptr(), vcombine_u8(tmp0, tmp1)); - }, - in, offsets, dx, dy, out); + const int in_yi = std::floor((id.y() + 0.5f) * hr - 0.5f); + const int offset_row = in_yi * in_stide_in_bytes; + + float32x4x4_t tmp = + { + { + vdupq_n_f32(0), + vdupq_n_f32(0), + vdupq_n_f32(0), + vdupq_n_f32(0) + } + }; + + tmp.val[0] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast(in.ptr() + offsets_ptr[0] + offset_row), in_stride, dx_ptr[0], dy_ptr[0]), tmp.val[0], 0); + tmp.val[0] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast(in.ptr() + offsets_ptr[4] + offset_row), in_stride, dx_ptr[4], dy_ptr[4]), tmp.val[0], 1); + tmp.val[0] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast(in.ptr() + offsets_ptr[8] + offset_row), in_stride, dx_ptr[8], dy_ptr[8]), tmp.val[0], 2); + tmp.val[0] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast(in.ptr() + offsets_ptr[12] + offset_row), in_stride, dx_ptr[12], dy_ptr[12]), tmp.val[0], 3); + + tmp.val[1] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast(in.ptr() + offsets_ptr[1] + offset_row), in_stride, dx_ptr[1], dy_ptr[1]), tmp.val[1], 0); + tmp.val[1] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast(in.ptr() + offsets_ptr[5] + offset_row), in_stride, dx_ptr[5], dy_ptr[5]), tmp.val[1], 1); + tmp.val[1] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast(in.ptr() + offsets_ptr[9] + offset_row), in_stride, dx_ptr[9], dy_ptr[9]), tmp.val[1], 2); + tmp.val[1] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast(in.ptr() + offsets_ptr[13] + offset_row), in_stride, dx_ptr[13], dy_ptr[13]), tmp.val[1], 3); + + tmp.val[2] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast(in.ptr() + offsets_ptr[2] + offset_row), in_stride, dx_ptr[2], dy_ptr[2]), tmp.val[2], 0); + tmp.val[2] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast(in.ptr() + offsets_ptr[6] + offset_row), in_stride, dx_ptr[6], dy_ptr[6]), tmp.val[2], 1); + tmp.val[2] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast(in.ptr() + offsets_ptr[10] + offset_row), in_stride, dx_ptr[10], dy_ptr[10]), tmp.val[2], 2); + tmp.val[2] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast(in.ptr() + offsets_ptr[14] + offset_row), in_stride, dx_ptr[14], dy_ptr[14]), tmp.val[2], 3); + + tmp.val[3] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast(in.ptr() + offsets_ptr[3] + offset_row), in_stride, dx_ptr[3], dy_ptr[3]), tmp.val[3], 0); + tmp.val[3] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast(in.ptr() + offsets_ptr[7] + offset_row), in_stride, dx_ptr[7], dy_ptr[7]), tmp.val[3], 1); + tmp.val[3] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast(in.ptr() + offsets_ptr[11] + offset_row), in_stride, dx_ptr[11], dy_ptr[11]), tmp.val[3], 2); + tmp.val[3] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast(in.ptr() + offsets_ptr[15] + offset_row), in_stride, dx_ptr[15], dy_ptr[15]), tmp.val[3], 3); + + vst4q_f32(reinterpret_cast(out.ptr()), tmp); + }, + in, offsets, dx, dy, out); + break; + } + default: + ARM_COMPUTE_ERROR("Not supported"); + break; + } } void NEScaleKernel::scale_area(const Window &window) diff --git a/src/core/NEON/kernels/NEWarpKernel.cpp b/src/core/NEON/kernels/NEWarpKernel.cpp index b13e99e800..62f4e5d057 100644 --- a/src/core/NEON/kernels/NEWarpKernel.cpp +++ b/src/core/NEON/kernels/NEWarpKernel.cpp @@ -177,7 +177,7 @@ void NEWarpAffineKernel::warp_undefined(const Window &window) *out.ptr() = nearest_interpolation(in.ptr(), x0, y0, stride); break; case InterpolationPolicy::BILINEAR: - *out.ptr() = pixel_bilinear_c1u8(in.ptr(), stride, x0, y0); + *out.ptr() = pixel_bilinear_c1(in.ptr(), stride, x0, y0); break; default: ARM_COMPUTE_ERROR("Interpolation not supported"); @@ -256,7 +256,7 @@ void NEWarpAffineKernel::warp_constant(const Window &window) *out.ptr() = nearest_interpolation(in.ptr(), x0, y0, stride); break; case InterpolationPolicy::BILINEAR: - *out.ptr() = pixel_bilinear_c1u8(in.ptr(), stride, x0, y0); + *out.ptr() = pixel_bilinear_c1(in.ptr(), stride, x0, y0); break; default: ARM_COMPUTE_ERROR("Interpolation not supported"); @@ -336,7 +336,7 @@ void NEWarpAffineKernel::warp_replicate(const Window &window) *out.ptr() = nearest_interpolation(in.ptr(), x0, y0, stride); break; case InterpolationPolicy::BILINEAR: - *out.ptr() = pixel_bilinear_c1u8(in.ptr(), stride, x0, y0); + *out.ptr() = pixel_bilinear_c1(in.ptr(), stride, x0, y0); break; default: ARM_COMPUTE_ERROR("Interpolation not supported"); @@ -445,7 +445,7 @@ void NEWarpPerspectiveKernel::warp_undefined(const Window &window *out.ptr() = nearest_interpolation(in.ptr(), xn, yn, stride); break; case InterpolationPolicy::BILINEAR: - *out.ptr() = pixel_bilinear_c1u8(in.ptr(), stride, xn, yn); + *out.ptr() = pixel_bilinear_c1(in.ptr(), stride, xn, yn); break; default: ARM_COMPUTE_ERROR("Interpolation not supported"); @@ -546,7 +546,7 @@ void NEWarpPerspectiveKernel::warp_constant(const Window &window) *out.ptr() = nearest_interpolation(in.ptr(), xn, yn, stride); break; case InterpolationPolicy::BILINEAR: - *out.ptr() = pixel_bilinear_c1u8(in.ptr(), stride, xn, yn); + *out.ptr() = pixel_bilinear_c1(in.ptr(), stride, xn, yn); break; default: ARM_COMPUTE_ERROR("Interpolation not supported"); @@ -678,7 +678,7 @@ void NEWarpPerspectiveKernel::warp_replicate(const Window &window *out.ptr() = nearest_interpolation(in.ptr(), xn, yn, stride); break; case InterpolationPolicy::BILINEAR: - *out.ptr() = pixel_bilinear_c1u8(in.ptr(), stride, xn, yn); + *out.ptr() = pixel_bilinear_c1(in.ptr(), stride, xn, yn); break; default: ARM_COMPUTE_ERROR("Interpolation not supported"); diff --git a/src/runtime/CL/functions/CLScale.cpp b/src/runtime/CL/functions/CLScale.cpp index 05522b4af5..49b0275019 100644 --- a/src/runtime/CL/functions/CLScale.cpp +++ b/src/runtime/CL/functions/CLScale.cpp @@ -31,13 +31,8 @@ using namespace arm_compute; -void CLScale::configure(ICLTensor *input, ICLTensor *output, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value) +void CLScale::configure(ICLTensor *input, ICLTensor *output, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value) { - ARM_COMPUTE_ERROR_ON(output == input); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16); - ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - auto k = arm_compute::support::cpp14::make_unique(); k->configure(input, output, policy, border_mode == BorderMode::UNDEFINED); _kernel = std::move(k); diff --git a/src/runtime/NEON/functions/NEScale.cpp b/src/runtime/NEON/functions/NEScale.cpp index 0b17e80ee7..bbd3fac63f 100644 --- a/src/runtime/NEON/functions/NEScale.cpp +++ b/src/runtime/NEON/functions/NEScale.cpp @@ -95,7 +95,7 @@ NEScale::NEScale() // NOLINT { } -void NEScale::configure(ITensor *input, ITensor *output, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value) +void NEScale::configure(ITensor *input, ITensor *output, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value) { ARM_COMPUTE_ERROR_ON(nullptr == input); ARM_COMPUTE_ERROR_ON(nullptr == output); -- cgit v1.2.1