aboutsummaryrefslogtreecommitdiff
path: root/src/core/NEON/kernels/NEScaleKernel.cpp
diff options
context:
space:
mode:
authorGeorgios Pinitas <georgios.pinitas@arm.com>2017-08-31 18:12:42 +0100
committerAnthony Barbier <anthony.barbier@arm.com>2018-11-02 16:35:24 +0000
commit583137cc60580023abfd9d05abf933e7e117e29f (patch)
treeb29ec55c11b65e2882e60c0cf8b592bf25e78b1b /src/core/NEON/kernels/NEScaleKernel.cpp
parent3021edfb5e72ef4cd91dbc754ce6ac55388ebc4e (diff)
downloadComputeLibrary-583137cc60580023abfd9d05abf933e7e117e29f.tar.gz
COMPMID-417: Add support for floats in scale.
Change-Id: I7d714ba13861509080a89817f54e9d32da83e970 Reviewed-on: http://mpd-gerrit.cambridge.arm.com/86026 Reviewed-by: Pablo Tello <pablo.tello@arm.com> Tested-by: Kaizen <jeremy.johnson+kaizengerrit@arm.com>
Diffstat (limited to 'src/core/NEON/kernels/NEScaleKernel.cpp')
-rw-r--r--src/core/NEON/kernels/NEScaleKernel.cpp209
1 files changed, 177 insertions, 32 deletions
diff --git a/src/core/NEON/kernels/NEScaleKernel.cpp b/src/core/NEON/kernels/NEScaleKernel.cpp
index 7ec4212227..6634d4b13c 100644
--- a/src/core/NEON/kernels/NEScaleKernel.cpp
+++ b/src/core/NEON/kernels/NEScaleKernel.cpp
@@ -50,8 +50,10 @@ BorderSize NEScaleKernel::border_size() const
void NEScaleKernel::configure(const ITensor *input, const ITensor *dx, const ITensor *dy, const ITensor *offsets, ITensor *output, InterpolationPolicy policy, bool border_undefined)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_ERROR_ON(output == input);
if(policy == InterpolationPolicy::NEAREST_NEIGHBOR)
{
@@ -243,6 +245,50 @@ void NEScaleKernel::scale_nearest(const Window &window)
in, offsets, out);
break;
}
+ case DataType::F32:
+ {
+ float32x4x4_t tmp =
+ {
+ {
+ vdupq_n_f32(0),
+ vdupq_n_f32(0),
+ vdupq_n_f32(0),
+ vdupq_n_f32(0)
+ }
+ };
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const auto offsets_ptr = reinterpret_cast<const int32_t *>(offsets.ptr());
+
+ const int in_yi = (id.y() + 0.5f) * hr;
+ const int offset_row = in_yi * input_stride;
+
+ tmp.val[0] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[0] + offset_row), tmp.val[0], 0);
+ tmp.val[0] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[4] + offset_row), tmp.val[0], 1);
+ tmp.val[0] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[8] + offset_row), tmp.val[0], 2);
+ tmp.val[0] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[12] + offset_row), tmp.val[0], 3);
+
+ tmp.val[1] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[1] + offset_row), tmp.val[1], 0);
+ tmp.val[1] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[5] + offset_row), tmp.val[1], 1);
+ tmp.val[1] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[9] + offset_row), tmp.val[1], 2);
+ tmp.val[1] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[13] + offset_row), tmp.val[1], 3);
+
+ tmp.val[2] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[2] + offset_row), tmp.val[2], 0);
+ tmp.val[2] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[6] + offset_row), tmp.val[2], 1);
+ tmp.val[2] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[10] + offset_row), tmp.val[2], 2);
+ tmp.val[2] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[14] + offset_row), tmp.val[2], 3);
+
+ tmp.val[3] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[3] + offset_row), tmp.val[3], 0);
+ tmp.val[3] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[7] + offset_row), tmp.val[3], 1);
+ tmp.val[3] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[11] + offset_row), tmp.val[3], 2);
+ tmp.val[3] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[15] + offset_row), tmp.val[3], 3);
+
+ vst4q_f32(reinterpret_cast<float *>(out.ptr()), tmp);
+ },
+ in, offsets, out);
+ break;
+ }
default:
ARM_COMPUTE_ERROR("Not supported");
break;
@@ -251,7 +297,7 @@ void NEScaleKernel::scale_nearest(const Window &window)
void NEScaleKernel::scale_bilinear(const Window &window)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(_input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(_input, 1, DataType::U8, DataType::S16, DataType::F32);
// Compute the ratio between source height and destination height
const auto hr = static_cast<float>(_input->info()->dimension(1)) / static_cast<float>(_output->info()->dimension(1));
@@ -278,41 +324,140 @@ void NEScaleKernel::scale_bilinear(const Window &window)
Iterator dy(_dy, win_off);
/* Input image stride */
- const size_t in_stride = _input->info()->strides_in_bytes()[1];
+ const size_t in_stide_in_bytes = _input->info()->strides_in_bytes()[1];
+ const size_t in_stride = in_stide_in_bytes / _input->info()->element_size();
- execute_window_loop(window, [&](const Coordinates & id)
+ switch(_input->info()->data_type())
{
- const auto offsets_ptr = reinterpret_cast<const int32_t *>(offsets.ptr());
- const auto dx_ptr = reinterpret_cast<const float *>(dx.ptr());
- const auto dy_ptr = reinterpret_cast<const float *>(dy.ptr());
- const auto in_ptr = reinterpret_cast<const uint8_t *>(in.ptr());
+ case DataType::U8:
+ {
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const auto offsets_ptr = reinterpret_cast<const int32_t *>(offsets.ptr());
+ const auto dx_ptr = reinterpret_cast<const float *>(dx.ptr());
+ const auto dy_ptr = reinterpret_cast<const float *>(dy.ptr());
+ const auto in_ptr = reinterpret_cast<const uint8_t *>(in.ptr());
+
+ const int in_yi = std::floor((id.y() + 0.5f) * hr - 0.5f);
+ const int offset_row = in_yi * in_stide_in_bytes;
+
+ uint8x8_t tmp0 = vdup_n_u8(0);
+ tmp0 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[0] + offset_row], in_stride, dx_ptr[0], dy_ptr[0]), tmp0, 0);
+ tmp0 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[1] + offset_row], in_stride, dx_ptr[1], dy_ptr[1]), tmp0, 1);
+ tmp0 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[2] + offset_row], in_stride, dx_ptr[2], dy_ptr[2]), tmp0, 2);
+ tmp0 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[3] + offset_row], in_stride, dx_ptr[3], dy_ptr[3]), tmp0, 3);
+ tmp0 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[4] + offset_row], in_stride, dx_ptr[4], dy_ptr[4]), tmp0, 4);
+ tmp0 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[5] + offset_row], in_stride, dx_ptr[5], dy_ptr[5]), tmp0, 5);
+ tmp0 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[6] + offset_row], in_stride, dx_ptr[6], dy_ptr[6]), tmp0, 6);
+ tmp0 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[7] + offset_row], in_stride, dx_ptr[7], dy_ptr[7]), tmp0, 7);
+
+ uint8x8_t tmp1 = vdup_n_u8(0);
+ tmp1 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[8] + offset_row], in_stride, dx_ptr[8], dy_ptr[8]), tmp1, 0);
+ tmp1 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[9] + offset_row], in_stride, dx_ptr[9], dy_ptr[9]), tmp1, 1);
+ tmp1 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[10] + offset_row], in_stride, dx_ptr[10], dy_ptr[10]), tmp1, 2);
+ tmp1 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[11] + offset_row], in_stride, dx_ptr[11], dy_ptr[11]), tmp1, 3);
+ tmp1 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[12] + offset_row], in_stride, dx_ptr[12], dy_ptr[12]), tmp1, 4);
+ tmp1 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[13] + offset_row], in_stride, dx_ptr[13], dy_ptr[13]), tmp1, 5);
+ tmp1 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[14] + offset_row], in_stride, dx_ptr[14], dy_ptr[14]), tmp1, 6);
+ tmp1 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[15] + offset_row], in_stride, dx_ptr[15], dy_ptr[15]), tmp1, 7);
+
+ vst1q_u8(out.ptr(), vcombine_u8(tmp0, tmp1));
+ },
+ in, offsets, dx, dy, out);
+ break;
+ }
+ case DataType::S16:
+ {
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const auto offsets_ptr = reinterpret_cast<const int32_t *>(offsets.ptr());
+ const auto dx_ptr = reinterpret_cast<const float *>(dx.ptr());
+ const auto dy_ptr = reinterpret_cast<const float *>(dy.ptr());
- const int in_yi = std::floor((id.y() + 0.5f) * hr - 0.5f);
- const int offset_row = in_yi * in_stride;
+ const int in_yi = std::floor((id.y() + 0.5f) * hr - 0.5f);
+ const int offset_row = in_yi * in_stide_in_bytes;
- uint8x8_t tmp0 = vdup_n_u8(0);
- tmp0 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[0] + offset_row], in_stride, dx_ptr[0], dy_ptr[0]), tmp0, 0);
- tmp0 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[1] + offset_row], in_stride, dx_ptr[1], dy_ptr[1]), tmp0, 1);
- tmp0 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[2] + offset_row], in_stride, dx_ptr[2], dy_ptr[2]), tmp0, 2);
- tmp0 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[3] + offset_row], in_stride, dx_ptr[3], dy_ptr[3]), tmp0, 3);
- tmp0 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[4] + offset_row], in_stride, dx_ptr[4], dy_ptr[4]), tmp0, 4);
- tmp0 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[5] + offset_row], in_stride, dx_ptr[5], dy_ptr[5]), tmp0, 5);
- tmp0 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[6] + offset_row], in_stride, dx_ptr[6], dy_ptr[6]), tmp0, 6);
- tmp0 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[7] + offset_row], in_stride, dx_ptr[7], dy_ptr[7]), tmp0, 7);
+ int16x8x2_t tmp =
+ {
+ {
+ vdupq_n_s16(0),
+ vdupq_n_s16(0)
+ }
+ };
+
+ tmp.val[0] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[0] + offset_row), in_stride, dx_ptr[0], dy_ptr[0]), tmp.val[0], 0);
+ tmp.val[0] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[2] + offset_row), in_stride, dx_ptr[2], dy_ptr[2]), tmp.val[0], 1);
+ tmp.val[0] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[4] + offset_row), in_stride, dx_ptr[4], dy_ptr[4]), tmp.val[0], 2);
+ tmp.val[0] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[6] + offset_row), in_stride, dx_ptr[6], dy_ptr[6]), tmp.val[0], 3);
+ tmp.val[0] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[8] + offset_row), in_stride, dx_ptr[8], dy_ptr[8]), tmp.val[0], 4);
+ tmp.val[0] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[10] + offset_row), in_stride, dx_ptr[10], dy_ptr[10]), tmp.val[0], 5);
+ tmp.val[0] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[12] + offset_row), in_stride, dx_ptr[12], dy_ptr[12]), tmp.val[0], 6);
+ tmp.val[0] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[14] + offset_row), in_stride, dx_ptr[14], dy_ptr[14]), tmp.val[0], 7);
+
+ tmp.val[1] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[1] + offset_row), in_stride, dx_ptr[1], dy_ptr[1]), tmp.val[1], 0);
+ tmp.val[1] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[3] + offset_row), in_stride, dx_ptr[3], dy_ptr[3]), tmp.val[1], 1);
+ tmp.val[1] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[5] + offset_row), in_stride, dx_ptr[5], dy_ptr[5]), tmp.val[1], 2);
+ tmp.val[1] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[7] + offset_row), in_stride, dx_ptr[7], dy_ptr[7]), tmp.val[1], 3);
+ tmp.val[1] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[9] + offset_row), in_stride, dx_ptr[9], dy_ptr[9]), tmp.val[1], 4);
+ tmp.val[1] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[11] + offset_row), in_stride, dx_ptr[11], dy_ptr[11]), tmp.val[1], 5);
+ tmp.val[1] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[13] + offset_row), in_stride, dx_ptr[13], dy_ptr[13]), tmp.val[1], 6);
+ tmp.val[1] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[15] + offset_row), in_stride, dx_ptr[15], dy_ptr[15]), tmp.val[1], 7);
- uint8x8_t tmp1 = vdup_n_u8(0);
- tmp1 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[8] + offset_row], in_stride, dx_ptr[8], dy_ptr[8]), tmp1, 0);
- tmp1 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[9] + offset_row], in_stride, dx_ptr[9], dy_ptr[9]), tmp1, 1);
- tmp1 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[10] + offset_row], in_stride, dx_ptr[10], dy_ptr[10]), tmp1, 2);
- tmp1 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[11] + offset_row], in_stride, dx_ptr[11], dy_ptr[11]), tmp1, 3);
- tmp1 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[12] + offset_row], in_stride, dx_ptr[12], dy_ptr[12]), tmp1, 4);
- tmp1 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[13] + offset_row], in_stride, dx_ptr[13], dy_ptr[13]), tmp1, 5);
- tmp1 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[14] + offset_row], in_stride, dx_ptr[14], dy_ptr[14]), tmp1, 6);
- tmp1 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[15] + offset_row], in_stride, dx_ptr[15], dy_ptr[15]), tmp1, 7);
+ vst2q_s16(reinterpret_cast<int16_t *>(out.ptr()), tmp);
+ },
+ in, offsets, dx, dy, out);
+ break;
+ }
+ case DataType::F32:
+ {
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const auto offsets_ptr = reinterpret_cast<const int32_t *>(offsets.ptr());
+ const auto dx_ptr = reinterpret_cast<const float *>(dx.ptr());
+ const auto dy_ptr = reinterpret_cast<const float *>(dy.ptr());
- vst1q_u8(out.ptr(), vcombine_u8(tmp0, tmp1));
- },
- in, offsets, dx, dy, out);
+ const int in_yi = std::floor((id.y() + 0.5f) * hr - 0.5f);
+ const int offset_row = in_yi * in_stide_in_bytes;
+
+ float32x4x4_t tmp =
+ {
+ {
+ vdupq_n_f32(0),
+ vdupq_n_f32(0),
+ vdupq_n_f32(0),
+ vdupq_n_f32(0)
+ }
+ };
+
+ tmp.val[0] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast<const float *>(in.ptr() + offsets_ptr[0] + offset_row), in_stride, dx_ptr[0], dy_ptr[0]), tmp.val[0], 0);
+ tmp.val[0] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast<const float *>(in.ptr() + offsets_ptr[4] + offset_row), in_stride, dx_ptr[4], dy_ptr[4]), tmp.val[0], 1);
+ tmp.val[0] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast<const float *>(in.ptr() + offsets_ptr[8] + offset_row), in_stride, dx_ptr[8], dy_ptr[8]), tmp.val[0], 2);
+ tmp.val[0] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast<const float *>(in.ptr() + offsets_ptr[12] + offset_row), in_stride, dx_ptr[12], dy_ptr[12]), tmp.val[0], 3);
+
+ tmp.val[1] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast<const float *>(in.ptr() + offsets_ptr[1] + offset_row), in_stride, dx_ptr[1], dy_ptr[1]), tmp.val[1], 0);
+ tmp.val[1] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast<const float *>(in.ptr() + offsets_ptr[5] + offset_row), in_stride, dx_ptr[5], dy_ptr[5]), tmp.val[1], 1);
+ tmp.val[1] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast<const float *>(in.ptr() + offsets_ptr[9] + offset_row), in_stride, dx_ptr[9], dy_ptr[9]), tmp.val[1], 2);
+ tmp.val[1] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast<const float *>(in.ptr() + offsets_ptr[13] + offset_row), in_stride, dx_ptr[13], dy_ptr[13]), tmp.val[1], 3);
+
+ tmp.val[2] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast<const float *>(in.ptr() + offsets_ptr[2] + offset_row), in_stride, dx_ptr[2], dy_ptr[2]), tmp.val[2], 0);
+ tmp.val[2] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast<const float *>(in.ptr() + offsets_ptr[6] + offset_row), in_stride, dx_ptr[6], dy_ptr[6]), tmp.val[2], 1);
+ tmp.val[2] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast<const float *>(in.ptr() + offsets_ptr[10] + offset_row), in_stride, dx_ptr[10], dy_ptr[10]), tmp.val[2], 2);
+ tmp.val[2] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast<const float *>(in.ptr() + offsets_ptr[14] + offset_row), in_stride, dx_ptr[14], dy_ptr[14]), tmp.val[2], 3);
+
+ tmp.val[3] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast<const float *>(in.ptr() + offsets_ptr[3] + offset_row), in_stride, dx_ptr[3], dy_ptr[3]), tmp.val[3], 0);
+ tmp.val[3] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast<const float *>(in.ptr() + offsets_ptr[7] + offset_row), in_stride, dx_ptr[7], dy_ptr[7]), tmp.val[3], 1);
+ tmp.val[3] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast<const float *>(in.ptr() + offsets_ptr[11] + offset_row), in_stride, dx_ptr[11], dy_ptr[11]), tmp.val[3], 2);
+ tmp.val[3] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast<const float *>(in.ptr() + offsets_ptr[15] + offset_row), in_stride, dx_ptr[15], dy_ptr[15]), tmp.val[3], 3);
+
+ vst4q_f32(reinterpret_cast<float *>(out.ptr()), tmp);
+ },
+ in, offsets, dx, dy, out);
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ break;
+ }
}
void NEScaleKernel::scale_area(const Window &window)