From f3c52596a87eb4ccada601fc57f612bab137415d Mon Sep 17 00:00:00 2001 From: Manuel Bottini Date: Wed, 12 May 2021 15:39:07 +0100 Subject: Remove padding from NERemapKernel Use of out_of_tensor function to check if parallel instructons can be used safely Reverting to serial computation otherwise Resolves: COMPMID-4449 Change-Id: I23a986612e3c5d0367e23e56f1aeedbb1330cffc Signed-off-by: Manuel Bottini Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5651 Comments-Addressed: Arm Jenkins Tested-by: Arm Jenkins Reviewed-by: Georgios Pinitas --- src/core/NEON/kernels/NERemapKernel.cpp | 307 ++++++++++++++++++++------------ src/core/NEON/kernels/NERemapKernel.h | 27 +-- 2 files changed, 213 insertions(+), 121 deletions(-) (limited to 'src/core/NEON') diff --git a/src/core/NEON/kernels/NERemapKernel.cpp b/src/core/NEON/kernels/NERemapKernel.cpp index 24d0dd82e8..a1ba29e4c4 100644 --- a/src/core/NEON/kernels/NERemapKernel.cpp +++ b/src/core/NEON/kernels/NERemapKernel.cpp @@ -38,55 +38,100 @@ #include #include -using namespace arm_compute; +using namespace arm_compute::scale_helpers; namespace arm_compute { class Coordinates; -} // namespace arm_compute namespace { -inline int32x4_t offset_nearest_interpolation(const float *mapx_ptr, const float *mapy_ptr, const float32x4_t &width, const float32x4_t &height, const int32x4_t &stride) +inline int32_t num_out_of_tensor(const float *mapx_ptr, const float *mapy_ptr, const int32x4_t &width_1, const int32x4_t &height_1) { - const float32x4_t lowerxy = vdupq_n_f32(-1.f); - - float32x4_t x = vld1q_f32(mapx_ptr); - float32x4_t y = vld1q_f32(mapy_ptr); - - // Clamp x coordinates - x = vmaxq_f32(lowerxy, vminq_f32(x, width)); - y = vmaxq_f32(lowerxy, vminq_f32(y, height)); - - const int32x4_t x_s32 = vcvtq_s32_f32(x); - const int32x4_t y_s32 = vcvtq_s32_f32(y); + const int32x4_t mapx_s32 = vcvtq_s32_f32(vld1q_f32(mapx_ptr)); + const int32x4_t mapy_s32 = vcvtq_s32_f32(vld1q_f32(mapy_ptr)); + + const int32x4_t outbx_s32 = vminq_s32(vmaxq_s32(vminq_s32(vsubq_s32(width_1, mapx_s32), mapx_s32), vdupq_n_s32(-1)), vdupq_n_s32(0)); // Contains -1 if out of border in x, 0 otherwise + const int32x4_t outby_s32 = vminq_s32(vmaxq_s32(vminq_s32(vsubq_s32(height_1, mapy_s32), mapy_s32), vdupq_n_s32(-1)), vdupq_n_s32(0)); // Contains -1 if out of border in y, 0 otherwise + + const int32x4_t out_of_tensor_v = vminq_s32(outbx_s32, outby_s32); +#if defined(__aarch64__) + // only AArch64 supports vaddv + return vaddvq_s32(out_of_tensor_v); +#else // __aarch64__ + return vgetq_lane_s32(out_of_tensor_v, 0) + vgetq_lane_s32(out_of_tensor_v, 1) + vgetq_lane_s32(out_of_tensor_v, 2) + vgetq_lane_s32(out_of_tensor_v, 3); +#endif // __aarch64__ +} - return vmlaq_s32(x_s32, y_s32, stride); +inline void serial_remap_nearest_interpolation(const uint8_t *in_ptr, const float *mapx_ptr, const float *mapy_ptr, uint8_t *out_ptr, + int32_t width_val, int32_t height_val, int32_t in_stride_val, uint8_t constant_border_value) +{ + const auto x_s32 = static_cast(*mapx_ptr); + const auto y_s32 = static_cast(*mapy_ptr); + if(x_s32 < 0 || y_s32 < 0 || x_s32 >= width_val || y_s32 >= height_val) + { + *(out_ptr) = constant_border_value; + } + else + { + *(out_ptr) = in_ptr[x_s32 + y_s32 * in_stride_val]; + } } -} // namespace +inline int32x4_t offset_nearest_interpolation(const float *mapx_ptr, const float *mapy_ptr, const int32x4_t &stride) +{ + const int32x4_t mapx_s32 = vcvtq_s32_f32(vld1q_f32(mapx_ptr)); + const int32x4_t mapy_s32 = vcvtq_s32_f32(vld1q_f32(mapy_ptr)); + return vmlaq_s32(mapx_s32, mapy_s32, stride); +} -NERemapKernel::NERemapKernel() - : _func(nullptr), _input(nullptr), _output(nullptr), _map_x(nullptr), _map_y(nullptr) +inline uint8_t pixel_bilinear_c1_clamp(const uint8_t *pixel_ptr, int32_t stride, int32_t width, int32_t height, float x, float y, uint8_t constant_border_value) { + x = std::max(-1.f, std::min(x, static_cast(width))); + y = std::max(-1.f, std::min(y, static_cast(height))); + + const int32_t xi = static_cast(std::floor(x)); + const int32_t yi = static_cast(std::floor(y)); + + const float dx = x - static_cast(xi); + const float dy = y - static_cast(yi); + + // Calculating the address won't trigger a segfault in case the value is outside the tensor + // The ternary operator resolves the values in both conditions + const uint8_t *a00 = (xi < 0 || xi >= width || yi < 0 || yi >= height) ? &constant_border_value : (pixel_ptr + xi + yi * stride); + const uint8_t *a01 = (xi + 1 >= width || yi < 0 || yi >= height) ? &constant_border_value : (pixel_ptr + xi + 1 + yi * stride); + const uint8_t *a10 = (xi < 0 || xi >= width || yi + 1 >= height) ? &constant_border_value : (pixel_ptr + xi + yi * stride + stride); + const uint8_t *a11 = (xi + 1 >= width || yi + 1 >= height) ? &constant_border_value : (pixel_ptr + xi + 1 + yi * stride + stride); + + const float dx1 = 1.0f - dx; + const float dy1 = 1.0f - dy; + const float w1 = dx1 * dy1; + const float w2 = dx * dy1; + const float w3 = dx1 * dy; + const float w4 = dx * dy; + + return static_cast((*a00) * w1 + (*a01) * w2 + (*a10) * w3 + (*a11) * w4); } +} // namespace -BorderSize NERemapKernel::border_size() const +NERemapKernel::NERemapKernel() + : _func(nullptr), _input(nullptr), _output(nullptr), _map_x(nullptr), _map_y(nullptr), _border_mode(BorderMode::UNDEFINED), _constant_border_value(0) { - return BorderSize(1); } -void NERemapKernel::configure(const ITensor *input, const ITensor *map_x, const ITensor *map_y, ITensor *output, InterpolationPolicy policy) +void NERemapKernel::configure(const ITensor *input, const ITensor *map_x, const ITensor *map_y, ITensor *output, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value) { ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8); ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(map_x, 1, DataType::F32); ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(map_y, 1, DataType::F32); - _input = input; - _output = output; - _map_x = map_x; - _map_y = map_y; + _input = input; + _output = output; + _map_x = map_x; + _map_y = map_y; + _border_mode = border_mode; + _constant_border_value = constant_border_value; switch(policy) { @@ -105,24 +150,8 @@ void NERemapKernel::configure(const ITensor *input, const ITensor *map_x, const break; } - constexpr unsigned int num_elems_processed_per_iteration = 16; - // Configure kernel window - Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration)); - - const int total_right = ceil_to_multiple(input->info()->dimension(0), num_elems_processed_per_iteration); - const int access_right = total_right + (((total_right - input->info()->dimension(0)) == 0) ? border_size().right : 0); - - AccessWindowStatic input_access(input->info(), -border_size().left, -border_size().top, access_right, input->info()->dimension(1) + border_size().bottom); - - AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); - AccessWindowHorizontal mapx_access(map_x->info(), 0, num_elems_processed_per_iteration); - AccessWindowHorizontal mapy_access(map_y->info(), 0, num_elems_processed_per_iteration); - - update_window_and_padding(win, input_access, mapx_access, mapy_access, output_access); - - output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape())); - + Window win = calculate_max_window(*output->info(), Steps()); INEKernel::configure(win); } @@ -134,94 +163,153 @@ void NERemapKernel::remap_nearest(const Window &window) win_in.set(Window::DimX, Window::Dimension(0, 0, 0)); win_in.set(Window::DimY, Window::Dimension(0, 0, 0)); - Iterator in(_input, win_in); - Iterator out(_output, window); - Iterator mapx(_map_x, window); - Iterator mapy(_map_y, window); + const auto window_start_x = static_cast(window.x().start()); + const auto window_end_x = static_cast(window.x().end()); + const int32_t window_step_x = 8; - const float32x4_t width = vdupq_n_f32(static_cast(_input->info()->dimension(0))); - const float32x4_t height = vdupq_n_f32(static_cast(_input->info()->dimension(1))); - const int32x4_t in_stride = vdupq_n_s32(static_cast(_input->info()->strides_in_bytes()[1])); + // Don't increment in X direction for the output, mapx, mapy tensors + Window win(window); + win.set(Window::DimX, Window::Dimension(0, 1, 1)); - execute_window_loop(window, [&](const Coordinates &) + Iterator in(_input, win_in); + Iterator out(_output, win); + Iterator mapx(_map_x, win); + Iterator mapy(_map_y, win); + + const int32_t width_val = static_cast(_input->info()->dimension(0)); + const int32_t height_val = static_cast(_input->info()->dimension(1)); + const int32_t in_stride_val = static_cast(_input->info()->strides_in_bytes()[1]); + const int32x4_t width_1 = vdupq_n_s32(width_val - 1); + const int32x4_t height_1 = vdupq_n_s32(height_val - 1); + const int32x4_t in_stride = vdupq_n_s32(in_stride_val); + + execute_window_loop(win, [&](const Coordinates &) { - const auto mapx_ptr = reinterpret_cast(mapx.ptr()); - const auto mapy_ptr = reinterpret_cast(mapy.ptr()); + auto mapx_ptr = reinterpret_cast(mapx.ptr()); + auto mapy_ptr = reinterpret_cast(mapy.ptr()); const uint8_t *in_ptr = in.ptr(); - - const int32x4_t offset0 = offset_nearest_interpolation(mapx_ptr + 0, mapy_ptr + 0, width, height, in_stride); - const int32x4_t offset1 = offset_nearest_interpolation(mapx_ptr + 4, mapy_ptr + 4, width, height, in_stride); - const int32x4_t offset2 = offset_nearest_interpolation(mapx_ptr + 8, mapy_ptr + 8, width, height, in_stride); - const int32x4_t offset3 = offset_nearest_interpolation(mapx_ptr + 12, mapy_ptr + 12, width, height, in_stride); - - uint8x16_t tmp = vdupq_n_u8(0); - tmp = vsetq_lane_u8(in_ptr[vgetq_lane_s32(offset0, 0)], tmp, 0); - tmp = vsetq_lane_u8(in_ptr[vgetq_lane_s32(offset0, 1)], tmp, 1); - tmp = vsetq_lane_u8(in_ptr[vgetq_lane_s32(offset0, 2)], tmp, 2); - tmp = vsetq_lane_u8(in_ptr[vgetq_lane_s32(offset0, 3)], tmp, 3); - tmp = vsetq_lane_u8(in_ptr[vgetq_lane_s32(offset1, 0)], tmp, 4); - tmp = vsetq_lane_u8(in_ptr[vgetq_lane_s32(offset1, 1)], tmp, 5); - tmp = vsetq_lane_u8(in_ptr[vgetq_lane_s32(offset1, 2)], tmp, 6); - tmp = vsetq_lane_u8(in_ptr[vgetq_lane_s32(offset1, 3)], tmp, 7); - tmp = vsetq_lane_u8(in_ptr[vgetq_lane_s32(offset2, 0)], tmp, 8); - tmp = vsetq_lane_u8(in_ptr[vgetq_lane_s32(offset2, 1)], tmp, 9); - tmp = vsetq_lane_u8(in_ptr[vgetq_lane_s32(offset2, 2)], tmp, 10); - tmp = vsetq_lane_u8(in_ptr[vgetq_lane_s32(offset2, 3)], tmp, 11); - tmp = vsetq_lane_u8(in_ptr[vgetq_lane_s32(offset3, 0)], tmp, 12); - tmp = vsetq_lane_u8(in_ptr[vgetq_lane_s32(offset3, 1)], tmp, 13); - tmp = vsetq_lane_u8(in_ptr[vgetq_lane_s32(offset3, 2)], tmp, 14); - tmp = vsetq_lane_u8(in_ptr[vgetq_lane_s32(offset3, 3)], tmp, 15); - vst1q_u8(out.ptr(), tmp); + uint8_t *out_ptr = out.ptr(); + int32_t x = window_start_x; + for(; x < window_end_x - window_step_x; x += window_step_x, mapx_ptr += window_step_x, mapy_ptr += window_step_x, out_ptr += window_step_x) + { + const int32_t out_of_tensor0 = num_out_of_tensor(mapx_ptr, mapy_ptr + 0, width_1, height_1); + const int32_t out_of_tensor1 = num_out_of_tensor(mapx_ptr + 4, mapy_ptr + 4, width_1, height_1); + const int32_t out_of_tensor = out_of_tensor0 + out_of_tensor1; + + if(out_of_tensor == -8) + { + // All elements are out of xy plane + uint8x8_t tmp = vdup_n_u8(_constant_border_value); + vst1_u8(out_ptr, tmp); + } + else if(out_of_tensor < 0) + { + // Some elements are out of xy plane + serial_remap_nearest_interpolation(in_ptr, mapx_ptr, mapy_ptr, out_ptr, width_val, height_val, in_stride_val, _constant_border_value); + serial_remap_nearest_interpolation(in_ptr, mapx_ptr + 1, mapy_ptr + 1, out_ptr + 1, width_val, height_val, in_stride_val, _constant_border_value); + serial_remap_nearest_interpolation(in_ptr, mapx_ptr + 2, mapy_ptr + 2, out_ptr + 2, width_val, height_val, in_stride_val, _constant_border_value); + serial_remap_nearest_interpolation(in_ptr, mapx_ptr + 3, mapy_ptr + 3, out_ptr + 3, width_val, height_val, in_stride_val, _constant_border_value); + serial_remap_nearest_interpolation(in_ptr, mapx_ptr + 4, mapy_ptr + 4, out_ptr + 4, width_val, height_val, in_stride_val, _constant_border_value); + serial_remap_nearest_interpolation(in_ptr, mapx_ptr + 5, mapy_ptr + 5, out_ptr + 5, width_val, height_val, in_stride_val, _constant_border_value); + serial_remap_nearest_interpolation(in_ptr, mapx_ptr + 6, mapy_ptr + 6, out_ptr + 6, width_val, height_val, in_stride_val, _constant_border_value); + serial_remap_nearest_interpolation(in_ptr, mapx_ptr + 7, mapy_ptr + 7, out_ptr + 7, width_val, height_val, in_stride_val, _constant_border_value); + } + else + { + // All elements are in xy plane + uint8x8_t tmp = vdup_n_u8(0); + const int32x4_t offset0 = offset_nearest_interpolation(mapx_ptr, mapy_ptr, in_stride); + const int32x4_t offset1 = offset_nearest_interpolation(mapx_ptr + 4, mapy_ptr + 4, in_stride); + tmp = vset_lane_u8(in_ptr[vgetq_lane_s32(offset0, 0)], tmp, 0); + tmp = vset_lane_u8(in_ptr[vgetq_lane_s32(offset0, 1)], tmp, 1); + tmp = vset_lane_u8(in_ptr[vgetq_lane_s32(offset0, 2)], tmp, 2); + tmp = vset_lane_u8(in_ptr[vgetq_lane_s32(offset0, 3)], tmp, 3); + tmp = vset_lane_u8(in_ptr[vgetq_lane_s32(offset1, 0)], tmp, 4); + tmp = vset_lane_u8(in_ptr[vgetq_lane_s32(offset1, 1)], tmp, 5); + tmp = vset_lane_u8(in_ptr[vgetq_lane_s32(offset1, 2)], tmp, 6); + tmp = vset_lane_u8(in_ptr[vgetq_lane_s32(offset1, 3)], tmp, 7); + vst1_u8(out_ptr, tmp); + } + } + for(; x < window_end_x; ++x, ++mapx_ptr, ++mapy_ptr, ++out_ptr) + { + serial_remap_nearest_interpolation(in_ptr, mapx_ptr, mapy_ptr, out_ptr, width_val, height_val, in_stride_val, _constant_border_value); + } }, in, out, mapx, mapy); } void NERemapKernel::remap_bilinear(const Window &window) { - using namespace scale_helpers; - // Don't increment in X and Y direction for the input tensor // A pointer to the start of this plane is needed as base for the precomputed offsets Window win_in(window); win_in.set(Window::DimX, Window::Dimension(0, 0, 0)); win_in.set(Window::DimY, Window::Dimension(0, 0, 0)); + const auto window_start_x = static_cast(window.x().start()); + const auto window_end_x = static_cast(window.x().end()); + const int32_t window_step_x = 8; + + // Don't increment in X direction for the output, mapx, mapy tensors + Window win(window); + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + Iterator in(_input, win_in); - Iterator out(_output, window); - Iterator mapx(_map_x, window); - Iterator mapy(_map_y, window); + Iterator out(_output, win); + Iterator mapx(_map_x, win); + Iterator mapy(_map_y, win); - const size_t width = _input->info()->dimension(0); - const size_t height = _input->info()->dimension(1); - const size_t in_stride = _input->info()->strides_in_bytes()[1]; + const int32_t width_val = static_cast(_input->info()->dimension(0)); + const int32_t height_val = static_cast(_input->info()->dimension(1)); + const int32x4_t width_2 = vdupq_n_s32(width_val - 2); + const int32x4_t height_2 = vdupq_n_s32(height_val - 2); + const int32_t in_stride_val = static_cast(_input->info()->strides_in_bytes()[1]); - execute_window_loop(window, [&](const Coordinates &) + execute_window_loop(win, [&](const Coordinates &) { - const auto mapx_ptr = reinterpret_cast(mapx.ptr()); - const auto mapy_ptr = reinterpret_cast(mapy.ptr()); + auto mapx_ptr = reinterpret_cast(mapx.ptr()); + auto mapy_ptr = reinterpret_cast(mapy.ptr()); const uint8_t *in_ptr = in.ptr(); - - uint8x8_t tmp0 = vdup_n_u8(0); - tmp0 = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[0], mapy_ptr[0]), tmp0, 0); - tmp0 = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[1], mapy_ptr[1]), tmp0, 1); - tmp0 = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[2], mapy_ptr[2]), tmp0, 2); - tmp0 = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[3], mapy_ptr[3]), tmp0, 3); - tmp0 = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[4], mapy_ptr[4]), tmp0, 4); - tmp0 = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[5], mapy_ptr[5]), tmp0, 5); - tmp0 = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[6], mapy_ptr[6]), tmp0, 6); - tmp0 = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[7], mapy_ptr[7]), tmp0, 7); - - uint8x8_t tmp1 = vdup_n_u8(0); - tmp1 = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[8], mapy_ptr[8]), tmp1, 0); - tmp1 = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[9], mapy_ptr[9]), tmp1, 1); - tmp1 = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[10], mapy_ptr[10]), tmp1, 2); - tmp1 = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[11], mapy_ptr[11]), tmp1, 3); - tmp1 = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[12], mapy_ptr[12]), tmp1, 4); - tmp1 = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[13], mapy_ptr[13]), tmp1, 5); - tmp1 = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[14], mapy_ptr[14]), tmp1, 6); - tmp1 = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[15], mapy_ptr[15]), tmp1, 7); - - vst1q_u8(out.ptr(), vcombine_u8(tmp0, tmp1)); + uint8_t *out_ptr = out.ptr(); + int32_t x = window_start_x; + for(; x < window_end_x - window_step_x; x += window_step_x, mapx_ptr += window_step_x, mapy_ptr += window_step_x, out_ptr += window_step_x) + { + const int32_t out_of_tensor0 = num_out_of_tensor(mapx_ptr, mapy_ptr + 0, width_2, height_2); + const int32_t out_of_tensor1 = num_out_of_tensor(mapx_ptr + 4, mapy_ptr + 4, width_2, height_2); + const int32_t out_of_tensor = out_of_tensor0 + out_of_tensor1; + + if(out_of_tensor < 0) + { + // Elements are out of xy plane + *(out_ptr) = pixel_bilinear_c1_clamp(in_ptr, in_stride_val, width_val, height_val, mapx_ptr[0], mapy_ptr[0], _constant_border_value); + *(out_ptr + 1) = pixel_bilinear_c1_clamp(in_ptr, in_stride_val, width_val, height_val, mapx_ptr[1], mapy_ptr[1], _constant_border_value); + *(out_ptr + 2) = pixel_bilinear_c1_clamp(in_ptr, in_stride_val, width_val, height_val, mapx_ptr[2], mapy_ptr[2], _constant_border_value); + *(out_ptr + 3) = pixel_bilinear_c1_clamp(in_ptr, in_stride_val, width_val, height_val, mapx_ptr[3], mapy_ptr[3], _constant_border_value); + *(out_ptr + 4) = pixel_bilinear_c1_clamp(in_ptr, in_stride_val, width_val, height_val, mapx_ptr[4], mapy_ptr[4], _constant_border_value); + *(out_ptr + 5) = pixel_bilinear_c1_clamp(in_ptr, in_stride_val, width_val, height_val, mapx_ptr[5], mapy_ptr[5], _constant_border_value); + *(out_ptr + 6) = pixel_bilinear_c1_clamp(in_ptr, in_stride_val, width_val, height_val, mapx_ptr[6], mapy_ptr[6], _constant_border_value); + *(out_ptr + 7) = pixel_bilinear_c1_clamp(in_ptr, in_stride_val, width_val, height_val, mapx_ptr[7], mapy_ptr[7], _constant_border_value); + } + else + { + // All elements are in xy plane + uint8x8_t tmp = vdup_n_u8(0); + tmp = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride_val, width_val, height_val, mapx_ptr[0], mapy_ptr[0], _constant_border_value), tmp, 0); + tmp = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride_val, width_val, height_val, mapx_ptr[1], mapy_ptr[1], _constant_border_value), tmp, 1); + tmp = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride_val, width_val, height_val, mapx_ptr[2], mapy_ptr[2], _constant_border_value), tmp, 2); + tmp = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride_val, width_val, height_val, mapx_ptr[3], mapy_ptr[3], _constant_border_value), tmp, 3); + tmp = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride_val, width_val, height_val, mapx_ptr[4], mapy_ptr[4], _constant_border_value), tmp, 4); + tmp = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride_val, width_val, height_val, mapx_ptr[5], mapy_ptr[5], _constant_border_value), tmp, 5); + tmp = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride_val, width_val, height_val, mapx_ptr[6], mapy_ptr[6], _constant_border_value), tmp, 6); + tmp = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride_val, width_val, height_val, mapx_ptr[7], mapy_ptr[7], _constant_border_value), tmp, 7); + vst1_u8(out_ptr, tmp); + } + } + for(; x < window_end_x; ++x, ++mapx_ptr, ++mapy_ptr, ++out_ptr) + { + *(out_ptr) = pixel_bilinear_c1_clamp(in_ptr, in_stride_val, width_val, height_val, mapx_ptr[0], mapy_ptr[0], _constant_border_value); + } }, in, out, mapx, mapy); } @@ -235,3 +323,4 @@ void NERemapKernel::run(const Window &window, const ThreadInfo &info) (this->*_func)(window); } +} // namespace arm_compute \ No newline at end of file diff --git a/src/core/NEON/kernels/NERemapKernel.h b/src/core/NEON/kernels/NERemapKernel.h index 21cb67ef58..33e929805a 100644 --- a/src/core/NEON/kernels/NERemapKernel.h +++ b/src/core/NEON/kernels/NERemapKernel.h @@ -54,17 +54,18 @@ public: /** Initialize the kernel's input, output and border mode. * - * @param[in] input Source tensor. Data type supported: U8. - * @param[in] map_x Map for X coordinates. Data type supported: F32. - * @param[in] map_y Map for Y coordinates. Data type supported: F32. - * @param[out] output Destination tensor. Data types supported: U8. All but the lowest two dimensions must be the same size as in the input tensor, i.e. remapping is only performed within the XY-plane. - * @param[in] policy The interpolation type. + * @param[in] input Source tensor. Data type supported: U8. + * @param[in] map_x Map for X coordinates. Data type supported: F32. + * @param[in] map_y Map for Y coordinates. Data type supported: F32. + * @param[out] output Destination tensor. Data types supported: U8. All but the lowest two dimensions must be the same size as in the input tensor, i.e. remapping is only performed within the XY-plane. + * @param[in] policy The interpolation type. + * @param[in] border_mode Border mode to use on the input tensor. + * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT. Defaults to 0. */ - void configure(const ITensor *input, const ITensor *map_x, const ITensor *map_y, ITensor *output, InterpolationPolicy policy); + void configure(const ITensor *input, const ITensor *map_x, const ITensor *map_y, ITensor *output, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value = 0); // Inherited methods overridden: void run(const Window &window, const ThreadInfo &info) override; - BorderSize border_size() const override; private: /** function to perform nearest interpolation on the given window */ @@ -74,10 +75,12 @@ private: /** Remap function to use for the particular interpolation type passed to configure() */ void (NERemapKernel::*_func)(const Window &window); - const ITensor *_input; /**< Input image */ - ITensor *_output; /**< Output image */ - const ITensor *_map_x; /**< Input remap x coordinates */ - const ITensor *_map_y; /**< Input remap y coordinates */ + const ITensor *_input; /**< Input image */ + ITensor *_output; /**< Output image */ + const ITensor *_map_x; /**< Input remap x coordinates */ + const ITensor *_map_y; /**< Input remap y coordinates */ + BorderMode _border_mode; /**< Border mode */ + uint8_t _constant_border_value; /**< Border value to use */ }; } // namespace arm_compute -#endif /*ARM_COMPUTE_NEREMAPKERNEL_H */ +#endif /*ARM_COMPUTE_NEREMAPKERNEL_H */ \ No newline at end of file -- cgit v1.2.1