From fc2f6d0427e1d886fcccc68867d1af1ccd96608b Mon Sep 17 00:00:00 2001
From: Manuel Bottini
Date: Wed, 26 Aug 2020 16:28:38 +0100
Subject: COMPMID-3145: Remove padding from NEScaleKernel

Change-Id: I530b12c6270d7dbeb3ef7af62484842ebcb65925
Signed-off-by: Manuel Bottini
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/4000
Tested-by: Arm Jenkins
Comments-Addressed: Arm Jenkins
Reviewed-by: Michele Di Giorgio
Reviewed-by: Michalis Spyrou
---
 arm_compute/core/NEON/kernels/NEScaleKernel.h    |   34 +-
 arm_compute/runtime/NEON/functions/NEScale.h     |   49 +-
 src/core/NEON/kernels/NEScaleKernel.cpp          | 1208 ++++++--------------
 src/runtime/NEON/functions/NEGaussianPyramid.cpp |    2 +-
 .../NEON/functions/NELaplacianReconstruct.cpp    |    2 +-
 src/runtime/NEON/functions/NEScale.cpp           |   65 +-
 tests/datasets/BorderModeDataset.h               |    4 +-
 tests/validation/NEON/Scale.cpp                  |   92 +-
 tests/validation/fixtures/ScaleFixture.h         |    2 +-
 9 files changed, 481 insertions(+), 977 deletions(-)

diff --git a/arm_compute/core/NEON/kernels/NEScaleKernel.h b/arm_compute/core/NEON/kernels/NEScaleKernel.h
index a2328b13bc..b35bb72741 100644
--- a/arm_compute/core/NEON/kernels/NEScaleKernel.h
+++ b/arm_compute/core/NEON/kernels/NEScaleKernel.h
@@ -83,34 +83,44 @@ public:
     // Inherited methods overridden:
     void run(const Window &window, const ThreadInfo &info) override;
-    BorderSize border_size() const override;
 
 private:
-    /** function to perform scale using nearest interpolation on the given window */
-    void scale_nearest_nchw(const Window &window);
-    /** function to perform scale using bilinear interpolation on the given window */
-    void scale_bilinear_nchw(const Window &window);
     /** function to perform scale using area interpolation on the given window
      *
      * @note Used only in case down-sampling.
      */
-    void scale_area_nchw(const Window &window);
-    /** function to perform scale on the given window */
-    void scale_nhwc(const Window &window);
-    /** Scale function to use for the particular interpolation type passed to configure() */
-    void (NEScaleKernel::*_func)(const Window &window);
+    void scale_area_nchw_u8(const Window &window);
+
+    /** function to perform scale using bilinear interpolation on the given window */
+    template <typename T>
+    void scale_bilinear_nchw(const Window &window);
+    /** function to perform scale using bilinear interpolation on the given window */
+    template <typename T>
+    void scale_bilinear_nhwc(const Window &window);
+    /** function to perform scale using bilinear interpolation on the given window */
+    template <typename T>
+    void scale_bilinear_qasymm(const Window &window);
+
+    /** function to perform scale using nearest neighbour on the given window */
+    template <typename T>
+    void scale_nearest_nchw(const Window &window);
+    /** function to perform scale using nearest neighbour on the given window */
+    template <typename T>
+    void scale_nearest_nhwc(const Window &window);
+
+    /** Pointer to the scale function to use for the configured data type, layout and interpolation policy */
+    using ScaleFunctionPtr = void (NEScaleKernel::*)(const Window &window);
+    ScaleFunctionPtr _func;
 
     const ITensor      *_offsets;
     const ITensor      *_dx;
    const ITensor      *_dy;
     const ITensor      *_input;
     ITensor            *_output;
     InterpolationPolicy _policy;
-    BorderSize          _border_size;
     BorderMode          _border_mode;
     PixelValue          _constant_border_value;
     float               _sampling_offset;
-    bool                _use_padding;
     bool                _align_corners;
 };
 } // namespace arm_compute
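The hunk above replaces the old _func member, which configure() used to assign from a switch on the interpolation policy alone, with a ScaleFunctionPtr alias that is resolved through a map keyed by data type, layout and policy (see the configure() hunk later in this patch). A minimal, self-contained sketch of that dispatch pattern, assuming nothing beyond the standard library (the class and function names here are hypothetical, not part of the library):

#include <iostream>
#include <map>
#include <string>

class Scaler
{
public:
    using ScaleFnPtr = void (Scaler::*)(int);

    void configure(const std::string &data_type, const std::string &layout, const std::string &policy)
    {
        // Key built the same way as the kernel's "scale_<type>_<layout>_<policy>" string
        static const std::map<std::string, ScaleFnPtr> map_function =
        {
            { "scale_U8_NHWC_BILINEAR", &Scaler::scale_bilinear },
            { "scale_U8_NHWC_NEAREST_NEIGHBOUR", &Scaler::scale_nearest },
        };
        const auto it = map_function.find("scale_" + data_type + "_" + layout + "_" + policy);
        _func = (it != map_function.end()) ? it->second : nullptr; // unsupported combination: stay unset
    }

    void run(int window)
    {
        if(_func != nullptr)
        {
            (this->*_func)(window); // dispatch through the member function pointer
        }
    }

private:
    void scale_bilinear(int) { std::cout << "bilinear\n"; }
    void scale_nearest(int) { std::cout << "nearest\n"; }

    ScaleFnPtr _func{ nullptr };
};

int main()
{
    Scaler s;
    s.configure("U8", "NHWC", "BILINEAR");
    s.run(0); // prints "bilinear"
    return 0;
}

One map entry per supported combination replaces a case label in every kernel body, which is what lets the per-type template instantiations declared above slot in without further branching.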
diff --git a/arm_compute/runtime/NEON/functions/NEScale.h b/arm_compute/runtime/NEON/functions/NEScale.h
index f149e3bbb9..4063e558a6 100644
--- a/arm_compute/runtime/NEON/functions/NEScale.h
+++ b/arm_compute/runtime/NEON/functions/NEScale.h
@@ -24,20 +24,17 @@
 #ifndef ARM_COMPUTE_NESCALEIMAGE_H
 #define ARM_COMPUTE_NESCALEIMAGE_H
 
-#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
 #include "arm_compute/core/NEON/kernels/NEScaleKernel.h"
 #include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
 #include "arm_compute/runtime/Tensor.h"
 
-#include <cstdint>
-
 namespace arm_compute
 {
 class ITensor;
 
 /** Basic function to run @ref NEScaleKernel */
-class NEScale : public IFunction
+class NEScale : public INESimpleFunctionNoBorder
 {
 public:
     /** Constructor
      *
      * Initialize NEScale
      */
     NEScale();
-    /** Initialize the function's source, destination, interpolation type and border_mode.
-     *
-     * @param[in, out] input                 Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/U8/S16/F16/F32. (Written to only for @p border_mode != UNDEFINED)
-     * @param[out]     output                Destination tensor. Data type supported: Same as @p input. All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
-     * @param[in]      policy                The interpolation type.
-     * @param[in]      border_mode           Strategy to use for borders.
-     * @param[in]      constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
-     * @param[in]      sampling_policy       (Optional) Sampling policy used by the interpolation. Defaults to @ref SamplingPolicy::CENTER
-     * @param[in]      use_padding           (Optional) Is padding in use or not. Defaults to true.
-     * @param[in]      align_corners         (Optional) Align corners of input and output, only affecting bilinear policy with TOP_LEFT sampling policy. Defaults to false.
-     */
-    ARM_COMPUTE_DEPRECATED_REL(20.08)
-    void configure(ITensor *input, ITensor *output, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value = PixelValue(),
-                   SamplingPolicy sampling_policy = SamplingPolicy::CENTER, bool use_padding = true, bool align_corners = false);
     /** Initialize the function's source, destination, interpolation type and border_mode.
      *
      * @param[in, out] input  Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/U8/S16/F16/F32. (Written to only for @p border_mode != UNDEFINED)
      * @param[out]     output Destination tensor. Data type supported: Same as @p input. All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
      * @param[in]      info   @ref ScaleKernelInfo to be used for configuration
      */
     void configure(ITensor *input, ITensor *output, const ScaleKernelInfo &info);
-    /** Static function to check if given info will lead to a valid configuration of @ref NEScale
-     *
-     * @param[in] input                 Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/U8/S16/F16/F32. (Written to only for @p border_mode != UNDEFINED)
-     * @param[in] output                Destination tensor. Data type supported: Same as @p input. All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
-     * @param[in] policy                The interpolation type.
-     * @param[in] border_mode           Strategy to use for borders.
-     * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
-     * @param[in] sampling_policy       (Optional) Sampling policy used by the interpolation. Defaults to @ref SamplingPolicy::CENTER
-     * @param[in] use_padding           (Optional) Is padding in use or not. Defaults to true.
-     * @param[in] align_corners         (Optional) Align corners of input and output, only affecting bilinear policy with TOP_LEFT sampling policy. Defaults to false.
-     *
-     * @return a status
-     */
-    ARM_COMPUTE_DEPRECATED_REL(20.08)
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, InterpolationPolicy policy, BorderMode border_mode,
-                           PixelValue constant_border_value = PixelValue(), SamplingPolicy sampling_policy = SamplingPolicy::CENTER, bool use_padding = true, bool align_corners = false);
     /** Static function to check if given info will lead to a valid configuration of @ref NEScale
      *
      * @param[in] input  Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/U8/S16/F16/F32. (Written to only for @p border_mode != UNDEFINED)
@@ -92,16 +59,10 @@
      */
     static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ScaleKernelInfo &info);
 
-    // Inherited methods overridden:
-    void run() override;
-
 private:
-    Tensor             _offsets;        /**< Offset to access the element with NEAREST interpolation or the top-left element with BILINEAR interpolation in the input tensor */
-    Tensor             _dx;             /**< Element's distance between the X real coordinate and the smallest X following integer */
-    Tensor             _dy;             /**< Element's distance between the Y real coordinate and the smallest Y following integer */
-    NEScaleKernel      _scale_kernel;   /**< Kernel to perform the scaling */
-    NEFillBorderKernel _border_handler; /**< kernel to handle tensor borders */
-    bool               _use_padding;    /**< Is padding used on the tensors */
+    Tensor _offsets; /**< Offset to access the element with NEAREST interpolation or the top-left element with BILINEAR interpolation in the input tensor */
+    Tensor _dx;      /**< Element's distance between the X real coordinate and the smallest X following integer */
+    Tensor _dy;      /**< Element's distance between the Y real coordinate and the smallest Y following integer */
 };
 } // namespace arm_compute
 #endif /*ARM_COMPUTE_NESCALEIMAGE_H */
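The kernel file that follows adds a compute_bilinear() helper used by all of the new bilinear paths. For reference, the weights it applies are the standard bilinear ones, w1 = (1 - dx)(1 - dy), w2 = dx(1 - dy), w3 = (1 - dx)dy and w4 = dx * dy, which always sum to 1. A standalone check of the same arithmetic in plain C++, independent of the library:

#include <cassert>

// Same weighting as the compute_bilinear() helper introduced below:
// a00/a01 are the top-left/top-right neighbours, a10/a11 the bottom pair,
// and (dx, dy) is the fractional position inside that 2x2 cell.
static float bilinear(float a00, float a01, float a10, float a11, float dx, float dy)
{
    const float dx1 = 1.0f - dx;
    const float dy1 = 1.0f - dy;
    return a00 * dx1 * dy1 + a01 * dx * dy1 + a10 * dx1 * dy + a11 * dx * dy;
}

int main()
{
    assert(bilinear(10.f, 20.f, 30.f, 40.f, 0.0f, 0.0f) == 10.f); // lands exactly on a00
    assert(bilinear(10.f, 20.f, 30.f, 40.f, 0.5f, 0.5f) == 25.f); // plain average of the four
    return 0;
}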
validate_and_configure_window_nchw(ITensorInfo *input, ITensorInfo *dx, ITensorInfo *dy, ITensorInfo *offsets, ITensorInfo *output, - const ScaleKernelInfo &info, BorderSize border_size) -{ - bool window_changed{ false }; - Window win{}; - - constexpr unsigned int num_elems_processed_per_iteration = 16; - - // Configure kernel window - win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration)); - - const ValidRegion &input_valid_region = input->valid_region(); - - if(offsets != nullptr) - { - AccessWindowHorizontal offsets_access(offsets, 0, num_elems_processed_per_iteration); - window_changed = window_changed || update_window_and_padding(win, offsets_access); - } - if(dx != nullptr && dy != nullptr) - { - AccessWindowHorizontal dx_access(dx, 0, num_elems_processed_per_iteration); - AccessWindowHorizontal dy_access(dy, 0, num_elems_processed_per_iteration); - window_changed = window_changed || update_window_and_padding(win, dx_access, dy_access); - } - - // Reads can occur within the valid region of the input - AccessWindowStatic input_access(input, input_valid_region.anchor[0] - border_size.left, - input_valid_region.anchor[1] - border_size.top, - input_valid_region.anchor[0] + input_valid_region.shape[0] + border_size.right, - input_valid_region.anchor[1] + input_valid_region.shape[1] + border_size.bottom); - AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration); - window_changed = window_changed || update_window_and_padding(win, input_access, output_access); - output_access.set_valid_region(win, calculate_valid_region_scale(*input, output->tensor_shape(), - info.interpolation_policy, info.sampling_policy, info.border_mode == BorderMode::UNDEFINED)); - - Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; - return std::make_pair(err, win); -} - -std::pair validate_and_configure_window_nhwc(ITensorInfo *input, ITensorInfo *output, const ScaleKernelInfo &info, BorderSize border_size) -{ - bool window_changed{ false }; - Window win{}; - - const unsigned int num_elems_processed_per_iteration = (info.use_padding && info.interpolation_policy == InterpolationPolicy::NEAREST_NEIGHBOR) ? 16 / input->element_size() : 1; - - // Configure kernel window - win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration)); - - if(info.use_padding) - { - AccessWindowStatic input_access(input, 0, -border_size.top, ceil_to_multiple(input->tensor_shape()[0], num_elems_processed_per_iteration), input->tensor_shape()[1]); - AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration); - window_changed = update_window_and_padding(win, input_access, output_access); - output->set_valid_region(calculate_valid_region_scale(*input, output->tensor_shape(), info.interpolation_policy, info.sampling_policy, info.border_mode == BorderMode::UNDEFINED)); - } - - Status err = (window_changed) ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; - return std::make_pair(err, win); -} - -std::pair validate_and_configure_window(ITensorInfo *input, ITensorInfo *dx, ITensorInfo *dy, ITensorInfo *offsets, ITensorInfo *output, - const ScaleKernelInfo &info, BorderSize border_size) -{ - std::pair win_config; - switch(input->data_layout()) - { - case DataLayout::NCHW: - if(!info.use_padding) - { - return std::make_pair(ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Padding required for NCHW"), Window{}); - } - win_config = validate_and_configure_window_nchw(input, dx, dy, offsets, output, info, border_size); - break; - case DataLayout::NHWC: - win_config = validate_and_configure_window_nhwc(input, output, info, border_size); - break; - default: - win_config = std::make_pair(ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Unsupported data layout!"), Window{}); - } - - return win_config; -} - -template -inline void scale_nearest_nhwc_core(const ITensor *input, const ITensor *offsets, ITensor *output, - float hr, Window window, const Window &win_in, size_t stride_w, size_t stride_h, size_t stride_c, float sampling_offset, bool align_corners) -{ - const int window_step_x = 16 / sizeof(T); - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - - window.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator in(input, win_in); - Iterator out(output, window); - - const size_t offsets_stride = stride_w / sizeof(T); - - execute_window_loop(window, [&](const Coordinates & id) - { - const int32_t offset = *reinterpret_cast(offsets->ptr_to_element(Coordinates(id.y(), id.z()))); - const auto in_yi = static_cast(align_corners ? arm_compute::utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) : std::floor((id.z() + sampling_offset) * hr)); - const int offset_row = in_yi * stride_h; - int32_t x = window_start_x; - for(; x < window_end_x - window_step_x; x += window_step_x) - { - wrapper::vstore(reinterpret_cast(out.ptr()) + x, - wrapper::vloadq(reinterpret_cast(in.ptr() + offset * offsets_stride + offset_row + x * stride_c))); - } - for(; x < window_end_x; ++x) - { - *(reinterpret_cast(out.ptr()) + x) = - *(reinterpret_cast(in.ptr() + offset * offsets_stride + offset_row + x * stride_c)); - } - }, - in, out); -} - -template -inline void scale_bilinear_nhwc_core(const ITensor *input, const ITensor *offsets, const ITensor *dx, const ITensor *dy, ITensor *output, - float hr, float sampling_offset, Window window, const Window &win_in, size_t stride_w, size_t stride_h, - size_t stride_c, BorderMode border_mode, PixelValue constant_border_value, bool use_padding) -{ - Iterator in(input, win_in); - Iterator out(output, window); - - const size_t stride_w_elems = stride_w / sizeof(T); - const size_t stride_h_elems = stride_h / sizeof(T); - - const int input_width = input->info()->dimension(1); - const int input_height = input->info()->dimension(2); - - T border_value; - if(use_padding && border_mode != BorderMode::REPLICATE) - { - // configure() sets top border to 0 for BorderMode::REPLICATE and border_value is not needed in execute_window_loop() for REPLICATE - border_value = *reinterpret_cast(input->buffer() + input->info()->offset_first_element_in_bytes() - stride_w); - } - else - { - border_value = static_cast(constant_border_value.get()); - } - - auto is_valid = [](int64_t x, int64_t low_x, int64_t high_x, int64_t y, int64_t low_y, int64_t high_y) - { - return !(x < 
low_x || x > high_x || y < low_y || y > high_y); - }; - - int border_size = (border_mode == BorderMode::UNDEFINED) ? 0 : 1; - - const UniformQuantizationInfo iq_info = input->info()->quantization_info().uniform(); - const UniformQuantizationInfo oq_info = output->info()->quantization_info().uniform(); - - execute_window_loop(window, [&](const Coordinates & id) - { - const auto offset = (*reinterpret_cast(offsets->ptr_to_element(Coordinates(id.y(), id.z())))) / static_cast(sizeof(T)); - const auto dx_scale = *reinterpret_cast(dx->ptr_to_element(Coordinates(id.y(), id.z()))); - const auto dy_scale = *reinterpret_cast(dy->ptr_to_element(Coordinates(id.y(), id.z()))); - const int in_yi = std::floor((id.z() + sampling_offset) * hr - sampling_offset); - const int offset_row = in_yi * stride_h + id.x() * stride_c; - const T *in_ptr = reinterpret_cast(in.ptr() + offset * stride_w + offset_row); - - if(is_valid(offset, -border_size, input_width - 1 + border_size, in_yi, -border_size, input_height - 1 + border_size)) - { - T a00 = 0; - T a01 = 0; - T a10 = 0; - T a11 = 0; - - if(border_mode == BorderMode::CONSTANT) - { - a00 = is_valid(offset, 0, input_width - 1, in_yi, 0, input_height - 1) ? *in_ptr : border_value; - a01 = is_valid(offset + 1, 0, input_width - 1, in_yi, 0, input_height - 1) ? *(in_ptr + stride_w_elems) : border_value; - a10 = is_valid(offset, 0, input_width - 1, in_yi + 1, 0, input_height - 1) ? *(in_ptr + stride_h_elems) : border_value; - a11 = is_valid(offset + 1, 0, input_width - 1, in_yi + 1, 0, input_height - 1) ? *(in_ptr + stride_h_elems + stride_w_elems) : border_value; - } - else if(border_mode == BorderMode::REPLICATE) - { - auto clamped_x = utility::clamp(offset, 0, input_width - 1); - auto clamped_x1 = utility::clamp(offset + 1, 0, input_width - 1); - auto clamped_y = utility::clamp(in_yi, 0, input_height - 1); - auto clamped_y1 = utility::clamp(in_yi + 1, 0, input_height - 1); - - a00 = *reinterpret_cast(in.ptr() + clamped_x * stride_w + clamped_y * stride_h + id.x() * stride_c); - a01 = *reinterpret_cast(in.ptr() + clamped_x1 * stride_w + clamped_y * stride_h + id.x() * stride_c); - a10 = *reinterpret_cast(in.ptr() + clamped_x * stride_w + clamped_y1 * stride_h + id.x() * stride_c); - a11 = *reinterpret_cast(in.ptr() + clamped_x1 * stride_w + clamped_y1 * stride_h + id.x() * stride_c); - } - else - { - a00 = is_valid(offset, 0, input_width - 1, in_yi, 0, input_height - 1) ? *in_ptr : 0; - a01 = is_valid(offset + 1, 0, input_width - 1, in_yi, 0, input_height - 1) ? *(in_ptr + stride_w_elems) : 0; - a10 = is_valid(offset, 0, input_width - 1, in_yi + 1, 0, input_height - 1) ? *(in_ptr + stride_h_elems) : 0; - a11 = is_valid(offset + 1, 0, input_width - 1, in_yi + 1, 0, input_height - 1) ? 
*(in_ptr + stride_h_elems + stride_w_elems) : 0; - } - - // Perform interpolation - const float dx1 = 1.0f - dx_scale; - const float dy1 = 1.0f - dy_scale; - - const float w1 = dx1 * dy1; - const float w2 = dx_scale * dy1; - const float w3 = dx1 * dy_scale; - const float w4 = dx_scale * dy_scale; - - T res = 0; - //dequantize quantized input - if(input->info()->data_type() == DataType::QASYMM8) - { - float inp00 = dequantize_qasymm8(a00, iq_info); - float inp01 = dequantize_qasymm8(a01, iq_info); - float inp10 = dequantize_qasymm8(a10, iq_info); - float inp11 = dequantize_qasymm8(a11, iq_info); - res = static_cast(quantize_qasymm8((inp00 * w1 + inp01 * w2 + inp10 * w3 + inp11 * w4), oq_info)); - } - else if(input->info()->data_type() == DataType::QASYMM8_SIGNED) - { - float inp00 = dequantize_qasymm8_signed(a00, iq_info); - float inp01 = dequantize_qasymm8_signed(a01, iq_info); - float inp10 = dequantize_qasymm8_signed(a10, iq_info); - float inp11 = dequantize_qasymm8_signed(a11, iq_info); - res = static_cast(quantize_qasymm8_signed((inp00 * w1 + inp01 * w2 + inp10 * w3 + inp11 * w4), oq_info)); - } - else - { - res = static_cast(a00 * w1 + a01 * w2 + a10 * w3 + a11 * w4); - } - // Store result - *reinterpret_cast(out.ptr()) = res; - } - else - { - if(border_mode == BorderMode::CONSTANT) - { - *reinterpret_cast(out.ptr()) = border_value; - } - else if(border_mode == BorderMode::REPLICATE) - { - auto clamped_x = utility::clamp(offset, 0, input_width - 1); - auto clamped_y = utility::clamp(in_yi, 0, input_height - 1); - *reinterpret_cast(out.ptr()) = *reinterpret_cast(in.ptr() + clamped_x * stride_w + clamped_y * stride_h + id.x() * stride_c); - } - } - }, - in, out); -} } // namespace NEScaleKernel::NEScaleKernel() - : _func(nullptr), _offsets(nullptr), _dx(nullptr), _dy(nullptr), _input(nullptr), _output(nullptr), _policy(), _border_size(1), _border_mode(), _constant_border_value(PixelValue()), - _sampling_offset(0), _use_padding(true), _align_corners(false) -{ -} - -BorderSize NEScaleKernel::border_size() const + : _func(nullptr), _offsets(nullptr), _dx(nullptr), _dy(nullptr), _input(nullptr), _output(nullptr), _policy(), _border_mode(), _constant_border_value(PixelValue()), _sampling_offset(0), + _align_corners(false) { - return _border_size; } void NEScaleKernel::configure(const ITensor *input, const ITensor *dx, const ITensor *dy, const ITensor *offsets, @@ -365,10 +125,8 @@ void NEScaleKernel::configure(const ITensor *input, const ITensor *dx, const ITe _dx = dx; _dy = dy; _policy = info.interpolation_policy; - _border_size = BorderSize(1); _border_mode = info.border_mode; _constant_border_value = info.constant_border_value; - _use_padding = info.use_padding; _align_corners = info.align_corners; if(info.sampling_policy == SamplingPolicy::CENTER) @@ -377,58 +135,85 @@ void NEScaleKernel::configure(const ITensor *input, const ITensor *dx, const ITe } // Compute the ratio between source width/height and destination width/height - const auto wr = arm_compute::scale_utils::calculate_resize_ratio(input->info()->dimension(idx_width), output->info()->dimension(idx_width), _align_corners); - const auto hr = arm_compute::scale_utils::calculate_resize_ratio(input->info()->dimension(idx_height), output->info()->dimension(idx_height), _align_corners); + const auto wr = scale_utils::calculate_resize_ratio(input->info()->dimension(idx_width), output->info()->dimension(idx_width), _align_corners); + const auto hr = scale_utils::calculate_resize_ratio(input->info()->dimension(idx_height), 
output->info()->dimension(idx_height), _align_corners);
 
-    // Add constant border only on top in case of NHWC layout
-    if(data_layout == DataLayout::NHWC)
+    // Area interpolation behaves as Nearest Neighbour in case of up-sampling
+    const auto policy_to_use = (info.interpolation_policy == InterpolationPolicy::AREA && wr <= 1.f && hr <= 1.f) ? InterpolationPolicy::NEAREST_NEIGHBOR : _policy;
+
+    if(_border_mode == BorderMode::UNDEFINED)
     {
-        _border_size = (info.border_mode != BorderMode::REPLICATE && info.interpolation_policy == InterpolationPolicy::BILINEAR && info.use_padding) ? BorderSize(1, 0, 0, 0) : BorderSize(0);
+        _border_mode           = BorderMode::CONSTANT;
+        _constant_border_value = PixelValue();
     }
+    std::string function_to_call("scale_");
+    function_to_call += string_from_data_type(_input->info()->data_type()) + "_";
+    function_to_call += string_from_data_layout(_input->info()->data_layout()) + "_";
+    function_to_call += string_from_interpolation_policy(policy_to_use);
 
-    // Area interpolation behaves as Nearest Neighbour in case of up-sampling
-    const auto policy_to_use = (info.interpolation_policy == InterpolationPolicy::AREA && wr <= 1.f && hr <= 1.f) ? InterpolationPolicy::NEAREST_NEIGHBOR : _policy;
+    static std::map<std::string, ScaleFunctionPtr> map_function =
+    {
+        { "scale_U8_NCHW_AREA_CONSTANT", &NEScaleKernel::scale_area_nchw_u8 },
+
+        { "scale_U8_NCHW_BILINEAR", &NEScaleKernel::scale_bilinear_nchw<uint8_t> },
+        { "scale_U8_NCHW_NEAREST_NEIGHBOUR", &NEScaleKernel::scale_nearest_nchw<uint8_t> },
 
-    // Select interpolation function
-    switch(policy_to_use)
+        { "scale_U8_NHWC_BILINEAR", &NEScaleKernel::scale_bilinear_nhwc<uint8_t> },
+        { "scale_U8_NHWC_NEAREST_NEIGHBOUR", &NEScaleKernel::scale_nearest_nhwc<uint8_t> },
+
+        { "scale_QASYMM8_NCHW_BILINEAR", &NEScaleKernel::scale_bilinear_qasymm<uint8_t> },
+        { "scale_QASYMM8_NCHW_NEAREST_NEIGHBOUR", &NEScaleKernel::scale_nearest_nchw<uint8_t> },
+
+        { "scale_QASYMM8_NHWC_BILINEAR", &NEScaleKernel::scale_bilinear_qasymm<uint8_t> },
+        { "scale_QASYMM8_NHWC_NEAREST_NEIGHBOUR", &NEScaleKernel::scale_nearest_nhwc<uint8_t> },
+
+        { "scale_QASYMM8_SIGNED_NCHW_BILINEAR", &NEScaleKernel::scale_bilinear_qasymm<int8_t> },
+        { "scale_QASYMM8_SIGNED_NCHW_NEAREST_NEIGHBOUR", &NEScaleKernel::scale_nearest_nchw<int8_t> },
+
+        { "scale_QASYMM8_SIGNED_NHWC_BILINEAR", &NEScaleKernel::scale_bilinear_qasymm<int8_t> },
+        { "scale_QASYMM8_SIGNED_NHWC_NEAREST_NEIGHBOUR", &NEScaleKernel::scale_nearest_nhwc<int8_t> },
+
+        { "scale_S16_NCHW_BILINEAR", &NEScaleKernel::scale_bilinear_nchw<int16_t> },
+        { "scale_S16_NCHW_NEAREST_NEIGHBOUR", &NEScaleKernel::scale_nearest_nchw<int16_t> },
+
+        { "scale_S16_NHWC_BILINEAR", &NEScaleKernel::scale_bilinear_nhwc<int16_t> },
+        { "scale_S16_NHWC_NEAREST_NEIGHBOUR", &NEScaleKernel::scale_nearest_nhwc<int16_t> },
+
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+        { "scale_F16_NCHW_BILINEAR", &NEScaleKernel::scale_bilinear_nchw<float16_t> },
+        { "scale_F16_NCHW_NEAREST_NEIGHBOUR", &NEScaleKernel::scale_nearest_nchw<float16_t> },
+
+        { "scale_F16_NHWC_BILINEAR", &NEScaleKernel::scale_bilinear_nhwc<float16_t> },
+        { "scale_F16_NHWC_NEAREST_NEIGHBOUR", &NEScaleKernel::scale_nearest_nhwc<float16_t> },
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+
+        { "scale_F32_NCHW_BILINEAR", &NEScaleKernel::scale_bilinear_nchw<float> },
+        { "scale_F32_NCHW_NEAREST_NEIGHBOUR", &NEScaleKernel::scale_nearest_nchw<float> },
+
+        { "scale_F32_NHWC_BILINEAR", &NEScaleKernel::scale_bilinear_nhwc<float> },
+        { "scale_F32_NHWC_NEAREST_NEIGHBOUR", &NEScaleKernel::scale_nearest_nhwc<float> },
+    };
+    auto it = map_function.find(function_to_call);
+    if(it != map_function.end())
     {
-        case InterpolationPolicy::NEAREST_NEIGHBOR:
-        {
-            _func = (data_layout == DataLayout::NCHW) ? 
&NEScaleKernel::scale_nearest_nchw : &NEScaleKernel::scale_nhwc; - break; - } - case InterpolationPolicy::BILINEAR: - { - _func = (data_layout == DataLayout::NCHW) ? &NEScaleKernel::scale_bilinear_nchw : &NEScaleKernel::scale_nhwc; - break; - } - case InterpolationPolicy::AREA: - { - _func = &NEScaleKernel::scale_area_nchw; - break; - } - default: - ARM_COMPUTE_ERROR("Unsupported interpolation mode"); + _func = it->second; } // Configure window - std::pair win_config = validate_and_configure_window(input->info(), - dx != nullptr ? dx->info() : nullptr, - dy != nullptr ? dy->info() : nullptr, - offsets != nullptr ? offsets->info() : nullptr, - output->info(), - info, border_size()); - - ARM_COMPUTE_ERROR_THROW_ON(win_config.first); - INEKernel::configure(win_config.second); + Window win = calculate_max_window(*output->info(), Steps()); + Coordinates coord; + coord.set_num_dimensions(output->info()->num_dimensions()); + output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape())); + INEKernel::configure(win); } +template void NEScaleKernel::scale_nearest_nchw(const Window &window) { - const size_t input_stride = _input->info()->strides_in_bytes()[1]; + const size_t in_dim_x = _input->info()->dimension(0); // Compute the ratio between source height and destination height - const auto hr = arm_compute::scale_utils::calculate_resize_ratio(_input->info()->dimension(1), _output->info()->dimension(1), _align_corners); + const auto hr = scale_utils::calculate_resize_ratio(_input->info()->dimension(1), _output->info()->dimension(1), _align_corners); // Don't increment in X and Y direction for the input tensor // A pointer to the start of this plane is needed as base for the precomputed offsets @@ -449,216 +234,24 @@ void NEScaleKernel::scale_nearest_nchw(const Window &window) Iterator in(_input, win_in); Iterator out(_output, window); Iterator offsets(_offsets, win_off); - - switch(_input->info()->data_type()) + execute_window_loop(window, [&](const Coordinates & id) { - case DataType::QASYMM8_SIGNED: - { - int8x16_t tmp = vdupq_n_s8(0); - - execute_window_loop(window, [&](const Coordinates & id) - { - const auto offsets_ptr = reinterpret_cast(offsets.ptr()); - const uint8_t *const in_ptr = in.ptr(); - - const auto in_yi = static_cast(_align_corners ? 
arm_compute::utils::rounding::round_half_away_from_zero((id.y() + _sampling_offset) * hr) : std::floor((id.y() + _sampling_offset) * hr)); - const int in_yi_clamped = std::min(static_cast(_input->info()->dimension(1)), std::max(in_yi, -1)); - ARM_COMPUTE_ERROR_ON(in_yi_clamped < -1 || in_yi_clamped > static_cast(_input->info()->dimension(1))); - const int offset_row = in_yi_clamped * input_stride; - - tmp = vsetq_lane_s8(in_ptr[offsets_ptr[0] + offset_row], tmp, 0); - tmp = vsetq_lane_s8(in_ptr[offsets_ptr[1] + offset_row], tmp, 1); - tmp = vsetq_lane_s8(in_ptr[offsets_ptr[2] + offset_row], tmp, 2); - tmp = vsetq_lane_s8(in_ptr[offsets_ptr[3] + offset_row], tmp, 3); - tmp = vsetq_lane_s8(in_ptr[offsets_ptr[4] + offset_row], tmp, 4); - tmp = vsetq_lane_s8(in_ptr[offsets_ptr[5] + offset_row], tmp, 5); - tmp = vsetq_lane_s8(in_ptr[offsets_ptr[6] + offset_row], tmp, 6); - tmp = vsetq_lane_s8(in_ptr[offsets_ptr[7] + offset_row], tmp, 7); - tmp = vsetq_lane_s8(in_ptr[offsets_ptr[8] + offset_row], tmp, 8); - tmp = vsetq_lane_s8(in_ptr[offsets_ptr[9] + offset_row], tmp, 9); - tmp = vsetq_lane_s8(in_ptr[offsets_ptr[10] + offset_row], tmp, 10); - tmp = vsetq_lane_s8(in_ptr[offsets_ptr[11] + offset_row], tmp, 11); - tmp = vsetq_lane_s8(in_ptr[offsets_ptr[12] + offset_row], tmp, 12); - tmp = vsetq_lane_s8(in_ptr[offsets_ptr[13] + offset_row], tmp, 13); - tmp = vsetq_lane_s8(in_ptr[offsets_ptr[14] + offset_row], tmp, 14); - tmp = vsetq_lane_s8(in_ptr[offsets_ptr[15] + offset_row], tmp, 15); - - vst1q_s8(reinterpret_cast(out.ptr()), tmp); - }, - in, offsets, out); - break; - } - case DataType::QASYMM8: - case DataType::U8: - { - uint8x16_t tmp = vdupq_n_u8(0); - - execute_window_loop(window, [&](const Coordinates & id) - { - const auto offsets_ptr = reinterpret_cast(offsets.ptr()); - const uint8_t *const in_ptr = in.ptr(); - - const auto in_yi = static_cast(_align_corners ? 
arm_compute::utils::rounding::round_half_away_from_zero((id.y() + _sampling_offset) * hr) : std::floor((id.y() + _sampling_offset) * hr)); - const int in_yi_clamped = std::min(static_cast(_input->info()->dimension(1)), std::max(in_yi, -1)); - ARM_COMPUTE_ERROR_ON(in_yi_clamped < -1 || in_yi_clamped > static_cast(_input->info()->dimension(1))); - const int offset_row = in_yi_clamped * input_stride; - - tmp = vsetq_lane_u8(in_ptr[offsets_ptr[0] + offset_row], tmp, 0); - tmp = vsetq_lane_u8(in_ptr[offsets_ptr[1] + offset_row], tmp, 1); - tmp = vsetq_lane_u8(in_ptr[offsets_ptr[2] + offset_row], tmp, 2); - tmp = vsetq_lane_u8(in_ptr[offsets_ptr[3] + offset_row], tmp, 3); - tmp = vsetq_lane_u8(in_ptr[offsets_ptr[4] + offset_row], tmp, 4); - tmp = vsetq_lane_u8(in_ptr[offsets_ptr[5] + offset_row], tmp, 5); - tmp = vsetq_lane_u8(in_ptr[offsets_ptr[6] + offset_row], tmp, 6); - tmp = vsetq_lane_u8(in_ptr[offsets_ptr[7] + offset_row], tmp, 7); - tmp = vsetq_lane_u8(in_ptr[offsets_ptr[8] + offset_row], tmp, 8); - tmp = vsetq_lane_u8(in_ptr[offsets_ptr[9] + offset_row], tmp, 9); - tmp = vsetq_lane_u8(in_ptr[offsets_ptr[10] + offset_row], tmp, 10); - tmp = vsetq_lane_u8(in_ptr[offsets_ptr[11] + offset_row], tmp, 11); - tmp = vsetq_lane_u8(in_ptr[offsets_ptr[12] + offset_row], tmp, 12); - tmp = vsetq_lane_u8(in_ptr[offsets_ptr[13] + offset_row], tmp, 13); - tmp = vsetq_lane_u8(in_ptr[offsets_ptr[14] + offset_row], tmp, 14); - tmp = vsetq_lane_u8(in_ptr[offsets_ptr[15] + offset_row], tmp, 15); - - vst1q_u8(out.ptr(), tmp); - }, - in, offsets, out); - break; - } - case DataType::S16: - { - int16x8x2_t tmp = - { - { - vdupq_n_s16(0), - vdupq_n_s16(0) - } - }; - - execute_window_loop(window, [&](const Coordinates & id) - { - const auto offsets_ptr = reinterpret_cast(offsets.ptr()); - const auto in_yi = static_cast(_align_corners ? 
arm_compute::utils::rounding::round_half_away_from_zero((id.y() + _sampling_offset) * hr) : std::floor((id.y() + _sampling_offset) * hr)); - const int offset_row = in_yi * input_stride; - - tmp.val[0] = vsetq_lane_s16(*reinterpret_cast(in.ptr() + offsets_ptr[0] + offset_row), tmp.val[0], 0); - tmp.val[0] = vsetq_lane_s16(*reinterpret_cast(in.ptr() + offsets_ptr[2] + offset_row), tmp.val[0], 1); - tmp.val[0] = vsetq_lane_s16(*reinterpret_cast(in.ptr() + offsets_ptr[4] + offset_row), tmp.val[0], 2); - tmp.val[0] = vsetq_lane_s16(*reinterpret_cast(in.ptr() + offsets_ptr[6] + offset_row), tmp.val[0], 3); - tmp.val[0] = vsetq_lane_s16(*reinterpret_cast(in.ptr() + offsets_ptr[8] + offset_row), tmp.val[0], 4); - tmp.val[0] = vsetq_lane_s16(*reinterpret_cast(in.ptr() + offsets_ptr[10] + offset_row), tmp.val[0], 5); - tmp.val[0] = vsetq_lane_s16(*reinterpret_cast(in.ptr() + offsets_ptr[12] + offset_row), tmp.val[0], 6); - tmp.val[0] = vsetq_lane_s16(*reinterpret_cast(in.ptr() + offsets_ptr[14] + offset_row), tmp.val[0], 7); - - tmp.val[1] = vsetq_lane_s16(*reinterpret_cast(in.ptr() + offsets_ptr[1] + offset_row), tmp.val[1], 0); - tmp.val[1] = vsetq_lane_s16(*reinterpret_cast(in.ptr() + offsets_ptr[3] + offset_row), tmp.val[1], 1); - tmp.val[1] = vsetq_lane_s16(*reinterpret_cast(in.ptr() + offsets_ptr[5] + offset_row), tmp.val[1], 2); - tmp.val[1] = vsetq_lane_s16(*reinterpret_cast(in.ptr() + offsets_ptr[7] + offset_row), tmp.val[1], 3); - tmp.val[1] = vsetq_lane_s16(*reinterpret_cast(in.ptr() + offsets_ptr[9] + offset_row), tmp.val[1], 4); - tmp.val[1] = vsetq_lane_s16(*reinterpret_cast(in.ptr() + offsets_ptr[11] + offset_row), tmp.val[1], 5); - tmp.val[1] = vsetq_lane_s16(*reinterpret_cast(in.ptr() + offsets_ptr[13] + offset_row), tmp.val[1], 6); - tmp.val[1] = vsetq_lane_s16(*reinterpret_cast(in.ptr() + offsets_ptr[15] + offset_row), tmp.val[1], 7); - - vst2q_s16(reinterpret_cast(out.ptr()), tmp); - }, - in, offsets, out); - break; - } -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F16: - { - float16x8x2_t tmp = - { - { - vdupq_n_f16(0), - vdupq_n_f16(0) - } - }; - - execute_window_loop(window, [&](const Coordinates & id) - { - const auto offsets_ptr = reinterpret_cast(offsets.ptr()); - const auto in_yi = static_cast(_align_corners ? 
arm_compute::utils::rounding::round_half_away_from_zero((id.y() + _sampling_offset) * hr) : std::floor((id.y() + _sampling_offset) * hr)); - const int offset_row = in_yi * input_stride; - - tmp.val[0] = vsetq_lane_f16(*reinterpret_cast(in.ptr() + offsets_ptr[0] + offset_row), tmp.val[0], 0); - tmp.val[0] = vsetq_lane_f16(*reinterpret_cast(in.ptr() + offsets_ptr[2] + offset_row), tmp.val[0], 1); - tmp.val[0] = vsetq_lane_f16(*reinterpret_cast(in.ptr() + offsets_ptr[4] + offset_row), tmp.val[0], 2); - tmp.val[0] = vsetq_lane_f16(*reinterpret_cast(in.ptr() + offsets_ptr[6] + offset_row), tmp.val[0], 3); - tmp.val[0] = vsetq_lane_f16(*reinterpret_cast(in.ptr() + offsets_ptr[8] + offset_row), tmp.val[0], 4); - tmp.val[0] = vsetq_lane_f16(*reinterpret_cast(in.ptr() + offsets_ptr[10] + offset_row), tmp.val[0], 5); - tmp.val[0] = vsetq_lane_f16(*reinterpret_cast(in.ptr() + offsets_ptr[12] + offset_row), tmp.val[0], 6); - tmp.val[0] = vsetq_lane_f16(*reinterpret_cast(in.ptr() + offsets_ptr[14] + offset_row), tmp.val[0], 7); - - tmp.val[1] = vsetq_lane_f16(*reinterpret_cast(in.ptr() + offsets_ptr[1] + offset_row), tmp.val[1], 0); - tmp.val[1] = vsetq_lane_f16(*reinterpret_cast(in.ptr() + offsets_ptr[3] + offset_row), tmp.val[1], 1); - tmp.val[1] = vsetq_lane_f16(*reinterpret_cast(in.ptr() + offsets_ptr[5] + offset_row), tmp.val[1], 2); - tmp.val[1] = vsetq_lane_f16(*reinterpret_cast(in.ptr() + offsets_ptr[7] + offset_row), tmp.val[1], 3); - tmp.val[1] = vsetq_lane_f16(*reinterpret_cast(in.ptr() + offsets_ptr[9] + offset_row), tmp.val[1], 4); - tmp.val[1] = vsetq_lane_f16(*reinterpret_cast(in.ptr() + offsets_ptr[11] + offset_row), tmp.val[1], 5); - tmp.val[1] = vsetq_lane_f16(*reinterpret_cast(in.ptr() + offsets_ptr[13] + offset_row), tmp.val[1], 6); - tmp.val[1] = vsetq_lane_f16(*reinterpret_cast(in.ptr() + offsets_ptr[15] + offset_row), tmp.val[1], 7); - - vst2q_f16(reinterpret_cast<__fp16 *>(out.ptr()), tmp); - }, - in, offsets, out); - break; - } -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - case DataType::F32: - { - float32x4x4_t tmp = - { - { - vdupq_n_f32(0), - vdupq_n_f32(0), - vdupq_n_f32(0), - vdupq_n_f32(0) - } - }; - - execute_window_loop(window, [&](const Coordinates & id) - { - const auto offsets_ptr = reinterpret_cast(offsets.ptr()); - const auto in_yi = static_cast(_align_corners ? 
arm_compute::utils::rounding::round_half_away_from_zero((id.y() + _sampling_offset) * hr) : std::floor((id.y() + _sampling_offset) * hr)); - const int offset_row = in_yi * input_stride; - - tmp.val[0] = vsetq_lane_f32(*reinterpret_cast(in.ptr() + offsets_ptr[0] + offset_row), tmp.val[0], 0); - tmp.val[0] = vsetq_lane_f32(*reinterpret_cast(in.ptr() + offsets_ptr[4] + offset_row), tmp.val[0], 1); - tmp.val[0] = vsetq_lane_f32(*reinterpret_cast(in.ptr() + offsets_ptr[8] + offset_row), tmp.val[0], 2); - tmp.val[0] = vsetq_lane_f32(*reinterpret_cast(in.ptr() + offsets_ptr[12] + offset_row), tmp.val[0], 3); - - tmp.val[1] = vsetq_lane_f32(*reinterpret_cast(in.ptr() + offsets_ptr[1] + offset_row), tmp.val[1], 0); - tmp.val[1] = vsetq_lane_f32(*reinterpret_cast(in.ptr() + offsets_ptr[5] + offset_row), tmp.val[1], 1); - tmp.val[1] = vsetq_lane_f32(*reinterpret_cast(in.ptr() + offsets_ptr[9] + offset_row), tmp.val[1], 2); - tmp.val[1] = vsetq_lane_f32(*reinterpret_cast(in.ptr() + offsets_ptr[13] + offset_row), tmp.val[1], 3); - - tmp.val[2] = vsetq_lane_f32(*reinterpret_cast(in.ptr() + offsets_ptr[2] + offset_row), tmp.val[2], 0); - tmp.val[2] = vsetq_lane_f32(*reinterpret_cast(in.ptr() + offsets_ptr[6] + offset_row), tmp.val[2], 1); - tmp.val[2] = vsetq_lane_f32(*reinterpret_cast(in.ptr() + offsets_ptr[10] + offset_row), tmp.val[2], 2); - tmp.val[2] = vsetq_lane_f32(*reinterpret_cast(in.ptr() + offsets_ptr[14] + offset_row), tmp.val[2], 3); - - tmp.val[3] = vsetq_lane_f32(*reinterpret_cast(in.ptr() + offsets_ptr[3] + offset_row), tmp.val[3], 0); - tmp.val[3] = vsetq_lane_f32(*reinterpret_cast(in.ptr() + offsets_ptr[7] + offset_row), tmp.val[3], 1); - tmp.val[3] = vsetq_lane_f32(*reinterpret_cast(in.ptr() + offsets_ptr[11] + offset_row), tmp.val[3], 2); - tmp.val[3] = vsetq_lane_f32(*reinterpret_cast(in.ptr() + offsets_ptr[15] + offset_row), tmp.val[3], 3); - - vst4q_f32(reinterpret_cast(out.ptr()), tmp); - }, - in, offsets, out); - break; - } - default: - ARM_COMPUTE_ERROR("Not supported"); - } + const auto offsets_ptr = reinterpret_cast(offsets.ptr()); + const auto in_yi = static_cast(_align_corners ? 
utils::rounding::round_half_away_from_zero((id.y() + _sampling_offset) * hr) : std::floor((id.y() + _sampling_offset) * hr));
+        const int32_t offset_row = in_yi * in_dim_x;
+        *reinterpret_cast<T *>(out.ptr()) = *(reinterpret_cast<const T *>(in.ptr()) + offsets_ptr[0] + offset_row);
+    },
+    in, offsets, out);
 }
 
+template <typename T>
 void NEScaleKernel::scale_bilinear_nchw(const Window &window)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(_input, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S16, DataType::F16, DataType::F32);
-
     // Compute the ratio between source height and destination height
-    const auto hr = arm_compute::scale_utils::calculate_resize_ratio(_input->info()->dimension(1), _output->info()->dimension(1), _align_corners);
+    const auto hr = scale_utils::calculate_resize_ratio(_input->info()->dimension(1), _output->info()->dimension(1), _align_corners);
+    Window     win_off;
+    win_off.set(Window::DimX, window.x());
+    win_off.set(Window::DimY, window.y());
 
     // Don't increment in X and Y direction for the input tensor
     // A pointer to the start of this plane is needed as base for the precomputed offsets
     Window win_in(window);
     win_in.set(Window::DimX, Window::Dimension(0, 0, 0));
     win_in.set(Window::DimY, Window::Dimension(0, 0, 0));
 
-    Window win_off;
-    win_off.set(Window::DimX, window.x());
-    win_off.set(Window::DimY, window.y());
-
     for(size_t d = Window::DimZ; d < _offsets->info()->num_dimensions(); ++d)
     {
         win_off.set(d, Window::Dimension(0, 0, 0));
     }
 
     Iterator in(_input, win_in);
     Iterator out(_output, window);
     Iterator offsets(_offsets, win_off);
     Iterator dx(_dx, win_off);
     Iterator dy(_dy, win_off);
 
-    /* Input image stride */
-    const size_t in_stide_in_bytes = _input->info()->strides_in_bytes()[1];
-    const size_t in_stride         = in_stide_in_bytes / _input->info()->element_size();
-
-    const UniformQuantizationInfo iq_info = _input->info()->quantization_info().uniform();
-    const UniformQuantizationInfo oq_info = _output->info()->quantization_info().uniform();
+    const int32_t in_dim_w = _input->info()->dimension(0);
+    const int32_t in_dim_h = _input->info()->dimension(1);
 
-    switch(_input->info()->data_type())
+    if(_border_mode == BorderMode::CONSTANT)
     {
-        case DataType::QASYMM8_SIGNED:
-        {
-            execute_window_loop(window, [&](const Coordinates & id)
-            {
-                const auto offsets_ptr = reinterpret_cast<const int32_t *>(offsets.ptr());
-                const auto dx_ptr      = reinterpret_cast<const float *>(dx.ptr());
-                const auto dy_ptr      = reinterpret_cast<const float *>(dy.ptr());
-                const auto in_ptr      = reinterpret_cast<const int8_t *>(in.ptr());
-
-                const int in_yi      = std::floor((id.y() + _sampling_offset) * hr - _sampling_offset);
-                const int offset_row = in_yi * in_stide_in_bytes;
-
-                int8x8_t tmp0 = vdup_n_s8(0);
-
-                tmp0 = vset_lane_s8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[0] + offset_row], in_stride, dx_ptr[0], dy_ptr[0], iq_info, oq_info), tmp0, 0);
-                tmp0 = vset_lane_s8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[1] + offset_row], in_stride, dx_ptr[1], dy_ptr[1], iq_info, oq_info), tmp0, 1);
-                tmp0 = vset_lane_s8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[2] + offset_row], in_stride, dx_ptr[2], dy_ptr[2], iq_info, oq_info), tmp0, 2);
-                tmp0 = vset_lane_s8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[3] + offset_row], in_stride, dx_ptr[3], dy_ptr[3], iq_info, oq_info), tmp0, 3);
-                tmp0 = vset_lane_s8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[4] + offset_row], in_stride, dx_ptr[4], dy_ptr[4], iq_info, oq_info), tmp0, 4);
-                tmp0 = vset_lane_s8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[5] + 
offset_row], in_stride, dx_ptr[5], dy_ptr[5], iq_info, oq_info), tmp0, 5); - tmp0 = vset_lane_s8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[6] + offset_row], in_stride, dx_ptr[6], dy_ptr[6], iq_info, oq_info), tmp0, 6); - tmp0 = vset_lane_s8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[7] + offset_row], in_stride, dx_ptr[7], dy_ptr[7], iq_info, oq_info), tmp0, 7); - - int8x8_t tmp1 = vdup_n_s8(0); - - tmp1 = vset_lane_s8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[8] + offset_row], in_stride, dx_ptr[8], dy_ptr[8], iq_info, oq_info), tmp1, 0); - tmp1 = vset_lane_s8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[9] + offset_row], in_stride, dx_ptr[9], dy_ptr[9], iq_info, oq_info), tmp1, 1); - tmp1 = vset_lane_s8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[10] + offset_row], in_stride, dx_ptr[10], dy_ptr[10], iq_info, oq_info), tmp1, 2); - tmp1 = vset_lane_s8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[11] + offset_row], in_stride, dx_ptr[11], dy_ptr[11], iq_info, oq_info), tmp1, 3); - tmp1 = vset_lane_s8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[12] + offset_row], in_stride, dx_ptr[12], dy_ptr[12], iq_info, oq_info), tmp1, 4); - tmp1 = vset_lane_s8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[13] + offset_row], in_stride, dx_ptr[13], dy_ptr[13], iq_info, oq_info), tmp1, 5); - tmp1 = vset_lane_s8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[14] + offset_row], in_stride, dx_ptr[14], dy_ptr[14], iq_info, oq_info), tmp1, 6); - tmp1 = vset_lane_s8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[15] + offset_row], in_stride, dx_ptr[15], dy_ptr[15], iq_info, oq_info), tmp1, 7); - - vst1q_s8(reinterpret_cast(out.ptr()), vcombine_s8(tmp0, tmp1)); - }, - in, offsets, dx, dy, out); - break; - } - case DataType::QASYMM8: - { - execute_window_loop(window, [&](const Coordinates & id) - { - const auto offsets_ptr = reinterpret_cast(offsets.ptr()); - const auto dx_ptr = reinterpret_cast(dx.ptr()); - const auto dy_ptr = reinterpret_cast(dy.ptr()); - const auto in_ptr = reinterpret_cast(in.ptr()); - - const int in_yi = std::floor((id.y() + _sampling_offset) * hr - _sampling_offset); - const int offset_row = in_yi * in_stide_in_bytes; - - uint8x8_t tmp0 = vdup_n_u8(0); - - tmp0 = vset_lane_u8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[0] + offset_row], in_stride, dx_ptr[0], dy_ptr[0], iq_info, oq_info), tmp0, 0); - tmp0 = vset_lane_u8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[1] + offset_row], in_stride, dx_ptr[1], dy_ptr[1], iq_info, oq_info), tmp0, 1); - tmp0 = vset_lane_u8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[2] + offset_row], in_stride, dx_ptr[2], dy_ptr[2], iq_info, oq_info), tmp0, 2); - tmp0 = vset_lane_u8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[3] + offset_row], in_stride, dx_ptr[3], dy_ptr[3], iq_info, oq_info), tmp0, 3); - tmp0 = vset_lane_u8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[4] + offset_row], in_stride, dx_ptr[4], dy_ptr[4], iq_info, oq_info), tmp0, 4); - tmp0 = vset_lane_u8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[5] + offset_row], in_stride, dx_ptr[5], dy_ptr[5], iq_info, oq_info), tmp0, 5); - tmp0 = vset_lane_u8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[6] + offset_row], in_stride, dx_ptr[6], dy_ptr[6], iq_info, oq_info), tmp0, 6); - tmp0 = vset_lane_u8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[7] + offset_row], in_stride, dx_ptr[7], dy_ptr[7], iq_info, oq_info), tmp0, 7); - - uint8x8_t tmp1 = vdup_n_u8(0); - - tmp1 = vset_lane_u8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[8] + offset_row], 
in_stride, dx_ptr[8], dy_ptr[8], iq_info, oq_info), tmp1, 0); - tmp1 = vset_lane_u8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[9] + offset_row], in_stride, dx_ptr[9], dy_ptr[9], iq_info, oq_info), tmp1, 1); - tmp1 = vset_lane_u8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[10] + offset_row], in_stride, dx_ptr[10], dy_ptr[10], iq_info, oq_info), tmp1, 2); - tmp1 = vset_lane_u8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[11] + offset_row], in_stride, dx_ptr[11], dy_ptr[11], iq_info, oq_info), tmp1, 3); - tmp1 = vset_lane_u8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[12] + offset_row], in_stride, dx_ptr[12], dy_ptr[12], iq_info, oq_info), tmp1, 4); - tmp1 = vset_lane_u8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[13] + offset_row], in_stride, dx_ptr[13], dy_ptr[13], iq_info, oq_info), tmp1, 5); - tmp1 = vset_lane_u8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[14] + offset_row], in_stride, dx_ptr[14], dy_ptr[14], iq_info, oq_info), tmp1, 6); - tmp1 = vset_lane_u8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[15] + offset_row], in_stride, dx_ptr[15], dy_ptr[15], iq_info, oq_info), tmp1, 7); - - vst1q_u8(out.ptr(), vcombine_u8(tmp0, tmp1)); - }, - in, offsets, dx, dy, out); - break; - } - case DataType::U8: - { - execute_window_loop(window, [&](const Coordinates & id) - { - const auto offsets_ptr = reinterpret_cast(offsets.ptr()); - const auto dx_ptr = reinterpret_cast(dx.ptr()); - const auto dy_ptr = reinterpret_cast(dy.ptr()); - const auto in_ptr = reinterpret_cast(in.ptr()); - - const int in_yi = std::floor((id.y() + _sampling_offset) * hr - _sampling_offset); - const int offset_row = in_yi * in_stide_in_bytes; - - uint8x8_t tmp0 = vdup_n_u8(0); - - tmp0 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[0] + offset_row], in_stride, dx_ptr[0], dy_ptr[0]), tmp0, 0); - tmp0 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[1] + offset_row], in_stride, dx_ptr[1], dy_ptr[1]), tmp0, 1); - tmp0 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[2] + offset_row], in_stride, dx_ptr[2], dy_ptr[2]), tmp0, 2); - tmp0 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[3] + offset_row], in_stride, dx_ptr[3], dy_ptr[3]), tmp0, 3); - tmp0 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[4] + offset_row], in_stride, dx_ptr[4], dy_ptr[4]), tmp0, 4); - tmp0 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[5] + offset_row], in_stride, dx_ptr[5], dy_ptr[5]), tmp0, 5); - tmp0 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[6] + offset_row], in_stride, dx_ptr[6], dy_ptr[6]), tmp0, 6); - tmp0 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[7] + offset_row], in_stride, dx_ptr[7], dy_ptr[7]), tmp0, 7); - - uint8x8_t tmp1 = vdup_n_u8(0); - - tmp1 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[8] + offset_row], in_stride, dx_ptr[8], dy_ptr[8]), tmp1, 0); - tmp1 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[9] + offset_row], in_stride, dx_ptr[9], dy_ptr[9]), tmp1, 1); - tmp1 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[10] + offset_row], in_stride, dx_ptr[10], dy_ptr[10]), tmp1, 2); - tmp1 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[11] + offset_row], in_stride, dx_ptr[11], dy_ptr[11]), tmp1, 3); - tmp1 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[12] + offset_row], in_stride, dx_ptr[12], dy_ptr[12]), tmp1, 4); - tmp1 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[13] + offset_row], in_stride, dx_ptr[13], dy_ptr[13]), tmp1, 5); - tmp1 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[14] + offset_row], in_stride, 
dx_ptr[14], dy_ptr[14]), tmp1, 6); - tmp1 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[15] + offset_row], in_stride, dx_ptr[15], dy_ptr[15]), tmp1, 7); - - vst1q_u8(out.ptr(), vcombine_u8(tmp0, tmp1)); - }, - in, offsets, dx, dy, out); - break; - } - case DataType::S16: - { - execute_window_loop(window, [&](const Coordinates & id) - { - const auto offsets_ptr = reinterpret_cast(offsets.ptr()); - const auto dx_ptr = reinterpret_cast(dx.ptr()); - const auto dy_ptr = reinterpret_cast(dy.ptr()); - - const int in_yi = std::floor((id.y() + _sampling_offset) * hr - _sampling_offset); - const int offset_row = in_yi * in_stide_in_bytes; - - int16x8x2_t tmp = - { - { - vdupq_n_s16(0), - vdupq_n_s16(0) - } - }; - - tmp.val[0] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast(in.ptr() + offsets_ptr[0] + offset_row), in_stride, dx_ptr[0], dy_ptr[0]), tmp.val[0], 0); - tmp.val[0] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast(in.ptr() + offsets_ptr[2] + offset_row), in_stride, dx_ptr[2], dy_ptr[2]), tmp.val[0], 1); - tmp.val[0] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast(in.ptr() + offsets_ptr[4] + offset_row), in_stride, dx_ptr[4], dy_ptr[4]), tmp.val[0], 2); - tmp.val[0] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast(in.ptr() + offsets_ptr[6] + offset_row), in_stride, dx_ptr[6], dy_ptr[6]), tmp.val[0], 3); - tmp.val[0] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast(in.ptr() + offsets_ptr[8] + offset_row), in_stride, dx_ptr[8], dy_ptr[8]), tmp.val[0], 4); - tmp.val[0] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast(in.ptr() + offsets_ptr[10] + offset_row), in_stride, dx_ptr[10], dy_ptr[10]), tmp.val[0], 5); - tmp.val[0] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast(in.ptr() + offsets_ptr[12] + offset_row), in_stride, dx_ptr[12], dy_ptr[12]), tmp.val[0], 6); - tmp.val[0] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast(in.ptr() + offsets_ptr[14] + offset_row), in_stride, dx_ptr[14], dy_ptr[14]), tmp.val[0], 7); - - tmp.val[1] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast(in.ptr() + offsets_ptr[1] + offset_row), in_stride, dx_ptr[1], dy_ptr[1]), tmp.val[1], 0); - tmp.val[1] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast(in.ptr() + offsets_ptr[3] + offset_row), in_stride, dx_ptr[3], dy_ptr[3]), tmp.val[1], 1); - tmp.val[1] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast(in.ptr() + offsets_ptr[5] + offset_row), in_stride, dx_ptr[5], dy_ptr[5]), tmp.val[1], 2); - tmp.val[1] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast(in.ptr() + offsets_ptr[7] + offset_row), in_stride, dx_ptr[7], dy_ptr[7]), tmp.val[1], 3); - tmp.val[1] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast(in.ptr() + offsets_ptr[9] + offset_row), in_stride, dx_ptr[9], dy_ptr[9]), tmp.val[1], 4); - tmp.val[1] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast(in.ptr() + offsets_ptr[11] + offset_row), in_stride, dx_ptr[11], dy_ptr[11]), tmp.val[1], 5); - tmp.val[1] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast(in.ptr() + offsets_ptr[13] + offset_row), in_stride, dx_ptr[13], dy_ptr[13]), tmp.val[1], 6); - tmp.val[1] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast(in.ptr() + offsets_ptr[15] + offset_row), in_stride, dx_ptr[15], dy_ptr[15]), tmp.val[1], 7); - - vst2q_s16(reinterpret_cast(out.ptr()), tmp); - }, - in, offsets, dx, dy, out); - break; - } #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F16: - { - execute_window_loop(window, [&](const Coordinates & id) - { - const auto offsets_ptr = reinterpret_cast(offsets.ptr()); - const auto dx_ptr = 
reinterpret_cast(dx.ptr()); - const auto dy_ptr = reinterpret_cast(dy.ptr()); - - const int in_yi = std::floor((id.y() + _sampling_offset) * hr - _sampling_offset); - const int offset_row = in_yi * in_stide_in_bytes; - - float16x8x2_t tmp = - { - { - vdupq_n_f16(0), - vdupq_n_f16(0) - } - }; - - tmp.val[0] = vsetq_lane_f16(delta_bilinear_c1(reinterpret_cast(in.ptr() + offsets_ptr[0] + offset_row), in_stride, dx_ptr[0], dy_ptr[0]), tmp.val[0], 0); - tmp.val[0] = vsetq_lane_f16(delta_bilinear_c1(reinterpret_cast(in.ptr() + offsets_ptr[2] + offset_row), in_stride, dx_ptr[2], dy_ptr[2]), tmp.val[0], 1); - tmp.val[0] = vsetq_lane_f16(delta_bilinear_c1(reinterpret_cast(in.ptr() + offsets_ptr[4] + offset_row), in_stride, dx_ptr[4], dy_ptr[4]), tmp.val[0], 2); - tmp.val[0] = vsetq_lane_f16(delta_bilinear_c1(reinterpret_cast(in.ptr() + offsets_ptr[6] + offset_row), in_stride, dx_ptr[6], dy_ptr[6]), tmp.val[0], 3); - tmp.val[0] = vsetq_lane_f16(delta_bilinear_c1(reinterpret_cast(in.ptr() + offsets_ptr[8] + offset_row), in_stride, dx_ptr[8], dy_ptr[8]), tmp.val[0], 4); - tmp.val[0] = vsetq_lane_f16(delta_bilinear_c1(reinterpret_cast(in.ptr() + offsets_ptr[10] + offset_row), in_stride, dx_ptr[10], dy_ptr[10]), tmp.val[0], 5); - tmp.val[0] = vsetq_lane_f16(delta_bilinear_c1(reinterpret_cast(in.ptr() + offsets_ptr[12] + offset_row), in_stride, dx_ptr[12], dy_ptr[12]), tmp.val[0], 6); - tmp.val[0] = vsetq_lane_f16(delta_bilinear_c1(reinterpret_cast(in.ptr() + offsets_ptr[14] + offset_row), in_stride, dx_ptr[14], dy_ptr[14]), tmp.val[0], 7); - - tmp.val[1] = vsetq_lane_f16(delta_bilinear_c1(reinterpret_cast(in.ptr() + offsets_ptr[1] + offset_row), in_stride, dx_ptr[1], dy_ptr[1]), tmp.val[1], 0); - tmp.val[1] = vsetq_lane_f16(delta_bilinear_c1(reinterpret_cast(in.ptr() + offsets_ptr[3] + offset_row), in_stride, dx_ptr[3], dy_ptr[3]), tmp.val[1], 1); - tmp.val[1] = vsetq_lane_f16(delta_bilinear_c1(reinterpret_cast(in.ptr() + offsets_ptr[5] + offset_row), in_stride, dx_ptr[5], dy_ptr[5]), tmp.val[1], 2); - tmp.val[1] = vsetq_lane_f16(delta_bilinear_c1(reinterpret_cast(in.ptr() + offsets_ptr[7] + offset_row), in_stride, dx_ptr[7], dy_ptr[7]), tmp.val[1], 3); - tmp.val[1] = vsetq_lane_f16(delta_bilinear_c1(reinterpret_cast(in.ptr() + offsets_ptr[9] + offset_row), in_stride, dx_ptr[9], dy_ptr[9]), tmp.val[1], 4); - tmp.val[1] = vsetq_lane_f16(delta_bilinear_c1(reinterpret_cast(in.ptr() + offsets_ptr[11] + offset_row), in_stride, dx_ptr[11], dy_ptr[11]), tmp.val[1], 5); - tmp.val[1] = vsetq_lane_f16(delta_bilinear_c1(reinterpret_cast(in.ptr() + offsets_ptr[13] + offset_row), in_stride, dx_ptr[13], dy_ptr[13]), tmp.val[1], 6); - tmp.val[1] = vsetq_lane_f16(delta_bilinear_c1(reinterpret_cast(in.ptr() + offsets_ptr[15] + offset_row), in_stride, dx_ptr[15], dy_ptr[15]), tmp.val[1], 7); - - vst2q_f16(reinterpret_cast<__fp16 *>(out.ptr()), tmp); - }, - in, offsets, dx, dy, out); - break; - } + using ConstType = typename std::conditional::value, half, T>::type; +#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ + using ConstType = T; #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - case DataType::F32: + const T const_border_value = static_cast(_constant_border_value.get()); + execute_window_loop(window, [&](const Coordinates & id) { - execute_window_loop(window, [&](const Coordinates & id) - { - const auto offsets_ptr = reinterpret_cast(offsets.ptr()); - const auto dx_ptr = reinterpret_cast(dx.ptr()); - const auto dy_ptr = reinterpret_cast(dy.ptr()); - - const int in_yi = std::floor((id.y() + _sampling_offset) * hr 
- _sampling_offset); - const int offset_row = in_yi * in_stide_in_bytes; - - float32x4x4_t tmp = - { - { - vdupq_n_f32(0), - vdupq_n_f32(0), - vdupq_n_f32(0), - vdupq_n_f32(0) - } - }; - - tmp.val[0] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast(in.ptr() + offsets_ptr[0] + offset_row), in_stride, dx_ptr[0], dy_ptr[0]), tmp.val[0], 0); - tmp.val[0] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast(in.ptr() + offsets_ptr[4] + offset_row), in_stride, dx_ptr[4], dy_ptr[4]), tmp.val[0], 1); - tmp.val[0] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast(in.ptr() + offsets_ptr[8] + offset_row), in_stride, dx_ptr[8], dy_ptr[8]), tmp.val[0], 2); - tmp.val[0] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast(in.ptr() + offsets_ptr[12] + offset_row), in_stride, dx_ptr[12], dy_ptr[12]), tmp.val[0], 3); - - tmp.val[1] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast(in.ptr() + offsets_ptr[1] + offset_row), in_stride, dx_ptr[1], dy_ptr[1]), tmp.val[1], 0); - tmp.val[1] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast(in.ptr() + offsets_ptr[5] + offset_row), in_stride, dx_ptr[5], dy_ptr[5]), tmp.val[1], 1); - tmp.val[1] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast(in.ptr() + offsets_ptr[9] + offset_row), in_stride, dx_ptr[9], dy_ptr[9]), tmp.val[1], 2); - tmp.val[1] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast(in.ptr() + offsets_ptr[13] + offset_row), in_stride, dx_ptr[13], dy_ptr[13]), tmp.val[1], 3); - - tmp.val[2] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast(in.ptr() + offsets_ptr[2] + offset_row), in_stride, dx_ptr[2], dy_ptr[2]), tmp.val[2], 0); - tmp.val[2] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast(in.ptr() + offsets_ptr[6] + offset_row), in_stride, dx_ptr[6], dy_ptr[6]), tmp.val[2], 1); - tmp.val[2] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast(in.ptr() + offsets_ptr[10] + offset_row), in_stride, dx_ptr[10], dy_ptr[10]), tmp.val[2], 2); - tmp.val[2] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast(in.ptr() + offsets_ptr[14] + offset_row), in_stride, dx_ptr[14], dy_ptr[14]), tmp.val[2], 3); - - tmp.val[3] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast(in.ptr() + offsets_ptr[3] + offset_row), in_stride, dx_ptr[3], dy_ptr[3]), tmp.val[3], 0); - tmp.val[3] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast(in.ptr() + offsets_ptr[7] + offset_row), in_stride, dx_ptr[7], dy_ptr[7]), tmp.val[3], 1); - tmp.val[3] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast(in.ptr() + offsets_ptr[11] + offset_row), in_stride, dx_ptr[11], dy_ptr[11]), tmp.val[3], 2); - tmp.val[3] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast(in.ptr() + offsets_ptr[15] + offset_row), in_stride, dx_ptr[15], dy_ptr[15]), tmp.val[3], 3); - - vst4q_f32(reinterpret_cast(out.ptr()), tmp); - }, - in, offsets, dx, dy, out); - break; - } - default: - ARM_COMPUTE_ERROR("Not supported"); - break; + const int32_t index_h = std::floor((id.y() + _sampling_offset) * hr - _sampling_offset); + const auto index_w = *(reinterpret_cast(offsets.ptr())); + const auto dx_val = *(reinterpret_cast(dx.ptr())); + const auto dy_val = *(reinterpret_cast(dy.ptr())); + const auto pixel_row_ptr = reinterpret_cast(in.ptr()); + + const auto a00 = (0 <= index_w && index_w < in_dim_w && 0 <= index_h && index_h < in_dim_h) ? (*(pixel_row_ptr + index_w + index_h * in_dim_w)) : const_border_value; + const auto a01 = (-1 <= index_w && index_w < in_dim_w - 1 && 0 <= index_h && index_h < in_dim_h) ? 
(*(pixel_row_ptr + index_w + 1 + index_h * in_dim_w)) : const_border_value;
+            const auto a10 = (0 <= index_w && index_w < in_dim_w && -1 <= index_h
+                              && index_h < in_dim_h - 1) ?
+                             (*(pixel_row_ptr + index_w + index_h * in_dim_w + in_dim_w)) :
+                             const_border_value;
+            const auto a11 = (-1 <= index_w && index_w < in_dim_w - 1 && -1 <= index_h
+                              && index_h < in_dim_h - 1) ?
+                             (*(pixel_row_ptr + index_w + 1 + index_h * in_dim_w + in_dim_w)) :
+                             const_border_value;
+
+            *reinterpret_cast<T *>(out.ptr()) = static_cast<T>(compute_bilinear(a00, a01, a10, a11, dx_val, dy_val));
+        },
+        in, offsets, dx, dy, out);
+    }
+    else if(_border_mode == BorderMode::REPLICATE)
+    {
+        execute_window_loop(window, [&](const Coordinates & id)
+        {
+            const int  index_h       = std::floor((id.y() + _sampling_offset) * hr - _sampling_offset);
+            const auto index_w       = *(reinterpret_cast<const int32_t *>(offsets.ptr()));
+            const auto dx_val        = *(reinterpret_cast<const float *>(dx.ptr()));
+            const auto dy_val        = *(reinterpret_cast<const float *>(dy.ptr()));
+            const auto pixel_row_ptr = reinterpret_cast<const T *>(in.ptr());
+
+            auto clamped_x  = utility::clamp<int>(index_w, 0, in_dim_w - 1);
+            auto clamped_x1 = utility::clamp<int>(index_w + 1, 0, in_dim_w - 1);
+            auto clamped_y  = utility::clamp<int>(index_h, 0, in_dim_h - 1);
+            auto clamped_y1 = utility::clamp<int>(index_h + 1, 0, in_dim_h - 1);
+
+            const auto a00 = *(pixel_row_ptr + clamped_x + clamped_y * in_dim_w);
+            const auto a01 = *(pixel_row_ptr + clamped_x1 + clamped_y * in_dim_w);
+            const auto a10 = *(pixel_row_ptr + clamped_x + clamped_y1 * in_dim_w);
+            const auto a11 = *(pixel_row_ptr + clamped_x1 + clamped_y1 * in_dim_w);
+
+            *reinterpret_cast<T *>(out.ptr()) = static_cast<T>(compute_bilinear(a00, a01, a10, a11, dx_val, dy_val));
+        },
+        in, offsets, dx, dy, out);
+    }
+    else
+    {
+        ARM_COMPUTE_ERROR("Not implemented");
+    }
+}
 
-void NEScaleKernel::scale_area_nchw(const Window &window)
+void NEScaleKernel::scale_area_nchw_u8(const Window &window)
 {
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(_input, 1, DataType::U8);
@@ -959,8 +348,8 @@ void NEScaleKernel::scale_area_nchw(const Window &window)
     Iterator in(_input, win_in);
     Iterator out(_output, window);
 
-    const auto wr = arm_compute::scale_utils::calculate_resize_ratio(_input->info()->dimension(0), _output->info()->dimension(0), _align_corners);
-    const auto hr = arm_compute::scale_utils::calculate_resize_ratio(_input->info()->dimension(1), _output->info()->dimension(1), _align_corners);
+    const auto wr = scale_utils::calculate_resize_ratio(_input->info()->dimension(0), _output->info()->dimension(0), _align_corners);
+    const auto hr = scale_utils::calculate_resize_ratio(_input->info()->dimension(1), _output->info()->dimension(1), _align_corners);
     const auto w = _input->info()->dimension(0);
     const auto h = _input->info()->dimension(1);
     const size_t in_stride = _input->info()->strides_in_bytes()[1];
@@ -994,123 +383,234 @@ void NEScaleKernel::scale_area_nchw(const Window &window)
     in, out);
 }
 
-void NEScaleKernel::scale_nhwc(const Window &window)
+template <typename T>
+void NEScaleKernel::scale_nearest_nhwc(const Window &window)
 {
-    // Get data layout and width/height indices
-    const DataLayout data_layout = DataLayout::NHWC;
-    const int idx_channels = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
-    const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
-    const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
-
-    const size_t input_stride_w = _input->info()->strides_in_bytes()[idx_width];
-    const size_t input_stride_h = 
_input->info()->strides_in_bytes()[idx_height]; - const size_t input_stride_c = _input->info()->strides_in_bytes()[idx_channels]; + const size_t in_dim_w = _input->info()->dimension(1); + const size_t in_dim_h = _input->info()->dimension(2); + const size_t in_dim_c = _input->info()->dimension(0); + const size_t in_dim_wc = in_dim_w * in_dim_c; // Compute the ratio between source height and destination height - const auto hr = arm_compute::scale_utils::calculate_resize_ratio(_input->info()->dimension(idx_height), _output->info()->dimension(idx_height), _align_corners); + const auto hr = scale_utils::calculate_resize_ratio(in_dim_h, _output->info()->dimension(2), _align_corners); + const auto window_start_x = static_cast(window.x().start()); + const auto window_end_x = static_cast(window.x().end()); + const int window_step_x = 16 / sizeof(T); - // Don't increment in width/height/channels for the input tensor + Window win(window); + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + // Don't increment in X and Y direction for the input tensor // A pointer to the start of this plane is needed as base for the precomputed offsets Window win_in(window); win_in.set(Window::DimX, Window::Dimension(0, 0, 0)); win_in.set(Window::DimY, Window::Dimension(0, 0, 0)); win_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); + Iterator in(_input, win_in); + Iterator out(_output, win); - switch(_input->info()->data_type()) + execute_window_loop(win, [&](const Coordinates & id) { - case DataType::QASYMM8_SIGNED: - { - if(_policy == InterpolationPolicy::NEAREST_NEIGHBOR) - { - scale_nearest_nhwc_core(_input, _offsets, _output, hr, window, win_in, input_stride_w, input_stride_h, input_stride_c, _sampling_offset, _align_corners); - } - else - { - scale_bilinear_nhwc_core(_input, _offsets, _dx, _dy, _output, hr, _sampling_offset, - window, win_in, input_stride_w, input_stride_h, input_stride_c, _border_mode, _constant_border_value, _use_padding); - } - break; - } - case DataType::QASYMM8: - case DataType::U8: + const int32_t offset = *reinterpret_cast(_offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_dim_c; + const auto in_hi = static_cast(_align_corners ? 
utils::rounding::round_half_away_from_zero((id.z() + _sampling_offset) * hr) : std::floor((id.z() + _sampling_offset) * hr)); + const int offset_row = in_hi * in_dim_wc; + int32_t x = window_start_x; + for(; x <= window_end_x - window_step_x; x += window_step_x) { - if(_policy == InterpolationPolicy::NEAREST_NEIGHBOR) - { - scale_nearest_nhwc_core(_input, _offsets, _output, hr, window, win_in, input_stride_w, input_stride_h, input_stride_c, _sampling_offset, _align_corners); - } - else - { - scale_bilinear_nhwc_core(_input, _offsets, _dx, _dy, _output, hr, _sampling_offset, - window, win_in, input_stride_w, input_stride_h, input_stride_c, _border_mode, _constant_border_value, _use_padding); - } - break; + wrapper::vstore(reinterpret_cast(out.ptr()) + x, + wrapper::vloadq(reinterpret_cast(in.ptr()) + offset + offset_row + x)); } - case DataType::S16: + for(; x < window_end_x; ++x) { - if(_policy == InterpolationPolicy::NEAREST_NEIGHBOR) - { - scale_nearest_nhwc_core(_input, _offsets, _output, hr, window, win_in, input_stride_w, input_stride_h, input_stride_c, _sampling_offset, _align_corners); - } - else - { - scale_bilinear_nhwc_core(_input, _offsets, _dx, _dy, _output, hr, _sampling_offset, - window, win_in, input_stride_w, input_stride_h, input_stride_c, _border_mode, _constant_border_value, _use_padding); - } - break; + *(reinterpret_cast(out.ptr()) + x) = *(reinterpret_cast(in.ptr()) + offset + offset_row + x); } + }, + in, out); +} + +template +void NEScaleKernel::scale_bilinear_nhwc(const Window &window) +{ + // Compute the ratio between source height and destination height + const auto hr = scale_utils::calculate_resize_ratio(_input->info()->dimension(2), _output->info()->dimension(2), _align_corners); + + Iterator out(_output, window); + const int in_dim_c = _input->info()->dimension(0); + const int in_dim_w = _input->info()->dimension(1); + const int in_dim_h = _input->info()->dimension(2); + const int input_wc = in_dim_c * in_dim_w; + + // Don't increment in Y and Z direction for the input tensor + // A pointer to the start of this plane is needed as base for the precomputed offsets + Window win_in(window); + win_in.set(Window::DimY, Window::Dimension(0, 0, 0)); + win_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); + Iterator in(_input, win_in); + + if(_border_mode == BorderMode::CONSTANT) + { #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F16: - { - if(_policy == InterpolationPolicy::NEAREST_NEIGHBOR) - { - scale_nearest_nhwc_core(_input, _offsets, _output, hr, - window, win_in, input_stride_w, input_stride_h, input_stride_c, _sampling_offset, _align_corners); - } - else - { - scale_bilinear_nhwc_core(_input, _offsets, _dx, _dy, _output, hr, _sampling_offset, - window, win_in, input_stride_w, input_stride_h, input_stride_c, _border_mode, _constant_border_value, _use_padding); - } - break; - } + using ConstType = typename std::conditional::value, half, T>::type; +#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ + using ConstType = T; #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - case DataType::F32: + const T const_border_value = static_cast(_constant_border_value.get()); + execute_window_loop(window, [&](const Coordinates & id) { - if(_policy == InterpolationPolicy::NEAREST_NEIGHBOR) - { - scale_nearest_nhwc_core(_input, _offsets, _output, hr, window, win_in, input_stride_w, input_stride_h, input_stride_c, _sampling_offset, _align_corners); - } - else - { - scale_bilinear_nhwc_core(_input, _offsets, _dx, _dy, _output, hr, _sampling_offset, - window, win_in, 
input_stride_w, input_stride_h, input_stride_c, _border_mode, _constant_border_value, _use_padding); - } - break; - } - default: - ARM_COMPUTE_ERROR("Not supported"); - break; + const auto offset = *reinterpret_cast(_offsets->ptr_to_element(Coordinates(id.y(), id.z()))); + const auto dx_val = *reinterpret_cast(_dx->ptr_to_element(Coordinates(id.y(), id.z()))); + const auto dy_val = *reinterpret_cast(_dy->ptr_to_element(Coordinates(id.y(), id.z()))); + const int32_t in_hi = std::floor((id.z() + _sampling_offset) * hr - _sampling_offset); + const T *in_ptr = reinterpret_cast(in.ptr()) + offset * in_dim_c + in_hi * input_wc; + + const auto a00 = (0 <= offset && offset < in_dim_w && 0 <= in_hi && in_hi < in_dim_h) ? *in_ptr : const_border_value; + const auto a01 = (-1 <= offset && offset < in_dim_w - 1 && 0 <= in_hi && in_hi < in_dim_h) ? *(in_ptr + in_dim_c) : const_border_value; + const auto a10 = (0 <= offset && offset < in_dim_w && -1 <= in_hi && in_hi < in_dim_h - 1) ? *(in_ptr + input_wc) : const_border_value; + const auto a11 = (-1 <= offset && offset < in_dim_w - 1 && -1 <= in_hi && in_hi < in_dim_h - 1) ? *(in_ptr + in_dim_c + input_wc) : const_border_value; + + *reinterpret_cast(out.ptr()) = static_cast(compute_bilinear(a00, a01, a10, a11, dx_val, dy_val)); + }, + in, out); + } + else if(_border_mode == BorderMode::REPLICATE) + { + execute_window_loop(window, [&](const Coordinates & id) + { + const auto offset = *reinterpret_cast(_offsets->ptr_to_element(Coordinates(id.y(), id.z()))); + const auto dx_val = *reinterpret_cast(_dx->ptr_to_element(Coordinates(id.y(), id.z()))); + const auto dy_val = *reinterpret_cast(_dy->ptr_to_element(Coordinates(id.y(), id.z()))); + const int in_hi = std::floor((id.z() + _sampling_offset) * hr - _sampling_offset); + + auto clamped_w = utility::clamp(offset, 0, in_dim_w - 1); + auto clamped_w1 = utility::clamp(offset + 1, 0, in_dim_w - 1); + auto clamped_h = utility::clamp(in_hi, 0, in_dim_h - 1); + auto clamped_h1 = utility::clamp(in_hi + 1, 0, in_dim_h - 1); + + const auto a00 = *(reinterpret_cast(in.ptr()) + clamped_w * in_dim_c + clamped_h * input_wc); + const auto a01 = *(reinterpret_cast(in.ptr()) + clamped_w1 * in_dim_c + clamped_h * input_wc); + const auto a10 = *(reinterpret_cast(in.ptr()) + clamped_w * in_dim_c + clamped_h1 * input_wc); + const auto a11 = *(reinterpret_cast(in.ptr()) + clamped_w1 * in_dim_c + clamped_h1 * input_wc); + + *reinterpret_cast(out.ptr()) = static_cast(compute_bilinear(a00, a01, a10, a11, dx_val, dy_val)); + }, + in, out); + } + else + { + ARM_COMPUTE_ERROR("Not implemented"); } } -Status NEScaleKernel::validate(const ITensorInfo *input, const ITensorInfo *dx, const ITensorInfo *dy, - const ITensorInfo *offsets, ITensorInfo *output, const ScaleKernelInfo &info) +template +void NEScaleKernel::scale_bilinear_qasymm(const Window &window) { - BorderSize border_size(1); - if(input->data_layout() == DataLayout::NHWC) + // Get data layout and width/height indices + const DataLayout data_layout = _input->info()->data_layout(); + const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + + // Compute the ratio between source height and destination height + const auto hr = scale_utils::calculate_resize_ratio(_input->info()->dimension(idx_height), _output->info()->dimension(idx_height), _align_corners); + Window win_off; + win_off.set(Window::DimX, Window::Dimension(0, 0, 0)); + 
win_off.set(Window::DimY, Window::Dimension(0, 0, 0)); + + // Don't increment in X and Y direction for the input tensor + // A pointer to the start of this plane is needed as base for the precomputed offsets + Window win_in(window); + win_in.set(idx_width, Window::Dimension(0, 0, 0)); + win_in.set(idx_height, Window::Dimension(0, 0, 0)); + + for(size_t d = Window::DimZ; d < _offsets->info()->num_dimensions(); ++d) { - border_size = (info.border_mode == BorderMode::CONSTANT && info.interpolation_policy == InterpolationPolicy::BILINEAR) ? BorderSize(1, 0, 0, 0) : BorderSize(0); + win_off.set(d, Window::Dimension(0, 0, 0)); } - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, dx, dy, offsets, output, info)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), - dx != nullptr ? dx->clone().get() : nullptr, - dy != nullptr ? dy->clone().get() : nullptr, - offsets != nullptr ? offsets->clone().get() : nullptr, - output->clone().get(), - info, border_size) - .first); + Iterator in(_input, win_in); + Iterator out(_output, window); + + const int32_t in_dim_w = _input->info()->dimension(idx_width); + const int32_t in_dim_h = _input->info()->dimension(idx_height); + const int32_t stride_w = _input->info()->strides_in_bytes()[idx_width]; + const int32_t stride_h = _input->info()->strides_in_bytes()[idx_height]; + + const UniformQuantizationInfo iq_info = _input->info()->quantization_info().uniform(); + const UniformQuantizationInfo oq_info = _output->info()->quantization_info().uniform(); + + if(_border_mode == BorderMode::CONSTANT) + { +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + using ConstType = typename std::conditional::value, half, T>::type; +#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ + using ConstType = T; +#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ + const T const_border_value = static_cast(_constant_border_value.get()); + execute_window_loop(window, [&](const Coordinates & id) + { + const int32_t index_h = std::floor((id[idx_height] + _sampling_offset) * hr - _sampling_offset); + const int32_t index_w = *(reinterpret_cast(_offsets->ptr_to_element(Coordinates(id[idx_width], id[idx_height])))); + const auto dx_val = *(reinterpret_cast(_dx->ptr_to_element(Coordinates(id[idx_width], id[idx_height])))); + const auto dy_val = *(reinterpret_cast(_dy->ptr_to_element(Coordinates(id[idx_width], id[idx_height])))); + const auto pixel_row_ptr = reinterpret_cast(in.ptr()); + + const auto a00 = (0 <= index_w && index_w < in_dim_w && 0 <= index_h && index_h < in_dim_h) ? + (*(pixel_row_ptr + index_w * stride_w + index_h * stride_h)) : + const_border_value; + const auto a01 = (-1 <= index_w && index_w < in_dim_w - 1 && 0 <= index_h && index_h < in_dim_h) ? + (*(pixel_row_ptr + (index_w + 1) * stride_w + index_h * stride_h)) : + const_border_value; + const auto a10 = (0 <= index_w && index_w < in_dim_w && -1 <= index_h && index_h < in_dim_h - 1) ? + (*(pixel_row_ptr + index_w * stride_w + (index_h + 1) * stride_h)) : + const_border_value; + const auto a11 = (-1 <= index_w && index_w < in_dim_w - 1 && -1 <= index_h && index_h < in_dim_h - 1) ? 
+ (*(pixel_row_ptr + (index_w + 1) * stride_w + (index_h + 1) * stride_h)) : + const_border_value; + + const float inp00 = Qasymm8QuantizationHelper::dequantize(a00, iq_info); + const float inp01 = Qasymm8QuantizationHelper::dequantize(a01, iq_info); + const float inp10 = Qasymm8QuantizationHelper::dequantize(a10, iq_info); + const float inp11 = Qasymm8QuantizationHelper::dequantize(a11, iq_info); + *reinterpret_cast(out.ptr()) = Qasymm8QuantizationHelper::quantize(compute_bilinear(inp00, inp01, inp10, inp11, dx_val, dy_val), oq_info); + }, + in, out); + } + else if(_border_mode == BorderMode::REPLICATE) + { + execute_window_loop(window, [&](const Coordinates & id) + { + const int index_h = std::floor((id[idx_height] + _sampling_offset) * hr - _sampling_offset); + const int32_t index_w = *(reinterpret_cast(_offsets->ptr_to_element(Coordinates(id[idx_width], id[idx_height])))); + const auto dx_val = *(reinterpret_cast(_dx->ptr_to_element(Coordinates(id[idx_width], id[idx_height])))); + const auto dy_val = *(reinterpret_cast(_dy->ptr_to_element(Coordinates(id[idx_width], id[idx_height])))); + const auto pixel_row_ptr = reinterpret_cast(in.ptr()); + + auto clamped_w = utility::clamp(index_w, 0, in_dim_w - 1); + auto clamped_w1 = utility::clamp(index_w + 1, 0, in_dim_w - 1); + auto clamped_h = utility::clamp(index_h, 0, in_dim_h - 1); + auto clamped_h1 = utility::clamp(index_h + 1, 0, in_dim_h - 1); + + const auto a00 = *(pixel_row_ptr + clamped_w * stride_w + clamped_h * stride_h); + const auto a01 = *(pixel_row_ptr + clamped_w1 * stride_w + clamped_h * stride_h); + const auto a10 = *(pixel_row_ptr + clamped_w * stride_w + clamped_h1 * stride_h); + const auto a11 = *(pixel_row_ptr + clamped_w1 * stride_w + clamped_h1 * stride_h); + + const float inp00 = Qasymm8QuantizationHelper::dequantize(a00, iq_info); + const float inp01 = Qasymm8QuantizationHelper::dequantize(a01, iq_info); + const float inp10 = Qasymm8QuantizationHelper::dequantize(a10, iq_info); + const float inp11 = Qasymm8QuantizationHelper::dequantize(a11, iq_info); + *reinterpret_cast(out.ptr()) = Qasymm8QuantizationHelper::quantize(compute_bilinear(inp00, inp01, inp10, inp11, dx_val, dy_val), oq_info); + }, + in, out); + } + else + { + ARM_COMPUTE_ERROR("Not implemented"); + } +} +Status NEScaleKernel::validate(const ITensorInfo *input, const ITensorInfo *dx, const ITensorInfo *dy, + const ITensorInfo *offsets, ITensorInfo *output, const ScaleKernelInfo &info) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, dx, dy, offsets, output, info)); return Status{}; } diff --git a/src/runtime/NEON/functions/NEGaussianPyramid.cpp b/src/runtime/NEON/functions/NEGaussianPyramid.cpp index ae883bcb20..e4e20e041b 100644 --- a/src/runtime/NEON/functions/NEGaussianPyramid.cpp +++ b/src/runtime/NEON/functions/NEGaussianPyramid.cpp @@ -168,7 +168,7 @@ void NEGaussianPyramidOrb::configure(const ITensor *input, IPyramid *pyramid, Bo _gaus5x5[i].configure(_pyramid->get_pyramid_level(i), _tmp.get_pyramid_level(i), border_mode, constant_border_value); /* Configure scale */ - _scale_nearest[i].configure(_tmp.get_pyramid_level(i), _pyramid->get_pyramid_level(i + 1), ScaleKernelInfo{ InterpolationPolicy::NEAREST_NEIGHBOR, BorderMode::UNDEFINED }); + _scale_nearest[i].configure(_tmp.get_pyramid_level(i), _pyramid->get_pyramid_level(i + 1), ScaleKernelInfo{ InterpolationPolicy::NEAREST_NEIGHBOR, BorderMode::UNDEFINED, PixelValue(), SamplingPolicy::CENTER, false }); } _tmp.allocate(); diff --git a/src/runtime/NEON/functions/NELaplacianReconstruct.cpp 
b/src/runtime/NEON/functions/NELaplacianReconstruct.cpp index 24755fc99b..aa5f8a21ca 100644 --- a/src/runtime/NEON/functions/NELaplacianReconstruct.cpp +++ b/src/runtime/NEON/functions/NELaplacianReconstruct.cpp @@ -73,7 +73,7 @@ void NELaplacianReconstruct::configure(const IPyramid *pyramid, ITensor *input, // Scale levels n-1 to 1, and add levels n-2 to 0 for(size_t l = 0; l < last_level; ++l) { - _scalef[l].configure(_tmp_pyr.get_pyramid_level(l + 1), _tmp_pyr.get_pyramid_level(l), ScaleKernelInfo{ arm_compute::InterpolationPolicy::NEAREST_NEIGHBOR, border_mode, constant_border_value }); + _scalef[l].configure(_tmp_pyr.get_pyramid_level(l + 1), _tmp_pyr.get_pyramid_level(l), ScaleKernelInfo{ arm_compute::InterpolationPolicy::NEAREST_NEIGHBOR, border_mode, constant_border_value, SamplingPolicy::CENTER, false }); _addf[l].configure(_tmp_pyr.get_pyramid_level(l), pyramid->get_pyramid_level(l), _tmp_pyr.get_pyramid_level(l), ConvertPolicy::SATURATE); } diff --git a/src/runtime/NEON/functions/NEScale.cpp b/src/runtime/NEON/functions/NEScale.cpp index 424049f24a..2278f07a1c 100644 --- a/src/runtime/NEON/functions/NEScale.cpp +++ b/src/runtime/NEON/functions/NEScale.cpp @@ -44,7 +44,7 @@ namespace arm_compute { namespace { -void precompute_dx_dy_offsets(ITensor *dx, ITensor *dy, ITensor *offsets, float wr, float hr, size_t input_element_size, SamplingPolicy sampling_policy, bool align_corners) +void precompute_dx_dy_offsets(ITensor *dx, ITensor *dy, ITensor *offsets, float wr, float hr, SamplingPolicy sampling_policy, bool align_corners) { ARM_COMPUTE_ERROR_ON(nullptr == offsets); ARM_COMPUTE_UNUSED(sampling_policy); @@ -72,7 +72,7 @@ void precompute_dx_dy_offsets(ITensor *dx, ITensor *dy, ITensor *offsets, float const int in_xi = std::floor(in_x); const int in_yi = std::floor(in_y); - *reinterpret_cast(offsets_it.ptr()) = in_xi * static_cast(input_element_size); + *reinterpret_cast(offsets_it.ptr()) = in_xi; *reinterpret_cast(dx_it.ptr()) = in_x - in_xi; *reinterpret_cast(dy_it.ptr()) = in_y - in_yi; }, @@ -85,23 +85,17 @@ void precompute_dx_dy_offsets(ITensor *dx, ITensor *dy, ITensor *offsets, float execute_window_loop(win, [&](const Coordinates & id) { - const float float_in_xi = (id.x() + sampling_offset) * wr; - const auto in_xi = static_cast(align_corners ? arm_compute::utils::rounding::round_half_away_from_zero(float_in_xi) : std::floor(float_in_xi)); - - *reinterpret_cast(offsets_it.ptr()) = in_xi * input_element_size; + const float float_in_xi = (id.x() + sampling_offset) * wr; + const auto in_xi = static_cast(align_corners ? 
arm_compute::utils::rounding::round_half_away_from_zero(float_in_xi) : std::floor(float_in_xi)); + *reinterpret_cast(offsets_it.ptr()) = in_xi; }, offsets_it); } } } // namespace -NEScale::NEScale() // NOLINT - : _offsets(), - _dx(), - _dy(), - _scale_kernel(), - _border_handler(), - _use_padding(true) +NEScale::NEScale() + : _offsets(), _dx(), _dy() { } @@ -110,7 +104,6 @@ void NEScale::configure(ITensor *input, ITensor *output, const ScaleKernelInfo & ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_ERROR_THROW_ON(NEScale::validate(input->info(), output->info(), info)); - _use_padding = info.use_padding; const bool is_align_corners_used = info.align_corners && arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(info.sampling_policy); // Get data layout and width/height indices @@ -119,18 +112,17 @@ void NEScale::configure(ITensor *input, ITensor *output, const ScaleKernelInfo & const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); // Get the tensor shape - const TensorShape shape(output->info()->dimension(idx_width), output->info()->dimension(idx_height)); + TensorShape shape(output->info()->dimension(idx_width)); + shape.set(1, output->info()->dimension(idx_height), false); // Compute the ratio between source width/height and destination width/height const auto wr = arm_compute::scale_utils::calculate_resize_ratio(input->info()->dimension(idx_width), output->info()->dimension(idx_width), is_align_corners_used); const auto hr = arm_compute::scale_utils::calculate_resize_ratio(input->info()->dimension(idx_height), output->info()->dimension(idx_height), is_align_corners_used); - // Get the element size of the input image - const size_t input_element_size = input->info()->element_size(); - // Area interpolation behaves as Nearest Neighbour in case of up-sampling const auto policy_to_use = (info.interpolation_policy == InterpolationPolicy::AREA && wr <= 1.f && hr <= 1.f) ? 
InterpolationPolicy::NEAREST_NEIGHBOR : info.interpolation_policy; + auto scale_kernel = arm_compute::support::cpp14::make_unique(); switch(policy_to_use) { case InterpolationPolicy::NEAREST_NEIGHBOR: @@ -138,13 +130,13 @@ void NEScale::configure(ITensor *input, ITensor *output, const ScaleKernelInfo & TensorInfo tensor_info_offsets(shape, Format::S32); _offsets.allocator()->init(tensor_info_offsets); - _scale_kernel.configure(input, nullptr, nullptr, &_offsets, output, info); + scale_kernel->configure(input, nullptr, nullptr, &_offsets, output, info); // Allocate once the configure methods have been called _offsets.allocator()->allocate(); // Pre-compute offsets for nearest interpolation - precompute_dx_dy_offsets(nullptr, nullptr, &_offsets, wr, hr, input_element_size, info.sampling_policy, is_align_corners_used); + precompute_dx_dy_offsets(nullptr, nullptr, &_offsets, wr, hr, info.sampling_policy, is_align_corners_used); break; } case InterpolationPolicy::BILINEAR: @@ -156,7 +148,7 @@ void NEScale::configure(ITensor *input, ITensor *output, const ScaleKernelInfo & _dx.allocator()->init(tensor_info_dxdy); _dy.allocator()->init(tensor_info_dxdy); - _scale_kernel.configure(input, &_dx, &_dy, &_offsets, output, info); + scale_kernel->configure(input, &_dx, &_dy, &_offsets, output, info); // Allocate once the configure methods have been called _offsets.allocator()->allocate(); @@ -164,27 +156,18 @@ void NEScale::configure(ITensor *input, ITensor *output, const ScaleKernelInfo & _dy.allocator()->allocate(); // Pre-compute dx, dy and offsets for bilinear interpolation - precompute_dx_dy_offsets(&_dx, &_dy, &_offsets, wr, hr, input_element_size, info.sampling_policy, is_align_corners_used); + precompute_dx_dy_offsets(&_dx, &_dy, &_offsets, wr, hr, info.sampling_policy, is_align_corners_used); break; } case InterpolationPolicy::AREA: { - _scale_kernel.configure(input, nullptr, nullptr, nullptr, output, info); + scale_kernel->configure(input, nullptr, nullptr, nullptr, output, info); break; } default: ARM_COMPUTE_ERROR("Unsupported interpolation mode"); } - if(info.use_padding) - { - _border_handler.configure(input, _scale_kernel.border_size(), info.border_mode, info.constant_border_value); - } -} - -void NEScale::configure(ITensor *input, ITensor *output, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, SamplingPolicy sampling_policy, bool use_padding, - bool align_corners) -{ - configure(input, output, ScaleKernelInfo{ policy, border_mode, constant_border_value, sampling_policy, use_padding, align_corners }); + _kernel = std::move(scale_kernel); } Status NEScale::validate(const ITensorInfo *input, const ITensorInfo *output, const ScaleKernelInfo &info) @@ -225,20 +208,4 @@ Status NEScale::validate(const ITensorInfo *input, const ITensorInfo *output, co ARM_COMPUTE_RETURN_ON_ERROR(NEScaleKernel::validate(input->clone().get(), dx, dy, offsets, output->clone().get(), info)); return Status{}; } - -Status NEScale::validate(const ITensorInfo *input, const ITensorInfo *output, InterpolationPolicy policy, - BorderMode border_mode, PixelValue constant_border_value, SamplingPolicy sampling_policy, bool use_padding, bool align_corners) -{ - ARM_COMPUTE_RETURN_ON_ERROR(NEScale::validate(input, output, ScaleKernelInfo{ policy, border_mode, constant_border_value, sampling_policy, use_padding, align_corners })); - return Status{}; -} - -void NEScale::run() -{ - if(_use_padding) - { - NEScheduler::get().schedule(&_border_handler, Window::DimZ); - } - 
NEScheduler::get().schedule(&_scale_kernel, Window::DimY); -} } // namespace arm_compute diff --git a/tests/datasets/BorderModeDataset.h b/tests/datasets/BorderModeDataset.h index 84a7a4cfb1..bb90ad2214 100644 --- a/tests/datasets/BorderModeDataset.h +++ b/tests/datasets/BorderModeDataset.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -27,8 +27,6 @@ #include "arm_compute/core/Types.h" #include "tests/framework/datasets/ContainerDataset.h" -#include "utils/TypePrinter.h" - namespace arm_compute { namespace test diff --git a/tests/validation/NEON/Scale.cpp b/tests/validation/NEON/Scale.cpp index 1b59faaf07..9a1e9b01b3 100644 --- a/tests/validation/NEON/Scale.cpp +++ b/tests/validation/NEON/Scale.cpp @@ -117,9 +117,8 @@ const auto output_shape = TensorShape{ 4, 6, 3, 2 }; constexpr auto default_data_type = DataType::U8; constexpr auto default_data_layout = DataLayout::NHWC; constexpr auto default_interpolation_policy = InterpolationPolicy::NEAREST_NEIGHBOR; -constexpr auto default_border_mode = BorderMode::UNDEFINED; +constexpr auto default_border_mode = BorderMode::CONSTANT; constexpr auto default_sampling_policy = SamplingPolicy::CENTER; -constexpr bool default_use_padding = false; TEST_CASE(NullPtr, framework::DatasetMode::ALL) { @@ -128,11 +127,11 @@ TEST_CASE(NullPtr, framework::DatasetMode::ALL) Status result{}; // nullptr is given as input - result = NEScale::validate(nullptr, &output, ScaleKernelInfo{ default_interpolation_policy, default_border_mode }); + result = NEScale::validate(nullptr, &output, ScaleKernelInfo{ default_interpolation_policy, default_border_mode, PixelValue(), SamplingPolicy::CENTER, false }); ARM_COMPUTE_EXPECT(bool(result) == false, framework::LogLevel::ERRORS); // nullptr is given as output - result = NEScale::validate(&input, nullptr, ScaleKernelInfo{ default_interpolation_policy, default_border_mode }); + result = NEScale::validate(&input, nullptr, ScaleKernelInfo{ default_interpolation_policy, default_border_mode, PixelValue(), SamplingPolicy::CENTER, false }); ARM_COMPUTE_EXPECT(bool(result) == false, framework::LogLevel::ERRORS); } @@ -170,7 +169,7 @@ TEST_CASE(SupportDataType, framework::DatasetMode::ALL) const auto input = TensorInfo{ input_shape, 1, kv.first, default_data_layout }; const auto output = TensorInfo{ output_shape, 1, kv.first, default_data_layout }; - result = NEScale::validate(&input, &output, ScaleKernelInfo{ default_interpolation_policy, default_border_mode }); + result = NEScale::validate(&input, &output, ScaleKernelInfo{ default_interpolation_policy, default_border_mode, PixelValue(), SamplingPolicy::CENTER, false }); ARM_COMPUTE_EXPECT(bool(result) == kv.second, framework::LogLevel::ERRORS); } } @@ -183,7 +182,7 @@ TEST_CASE(MissmatchingDataType, framework::DatasetMode::ALL) const auto output = TensorInfo{ output_shape, 1, non_default_data_type, default_data_layout }; Status result{}; - result = NEScale::validate(&input, &output, ScaleKernelInfo{ default_interpolation_policy, default_border_mode }); + result = NEScale::validate(&input, &output, ScaleKernelInfo{ default_interpolation_policy, default_border_mode, PixelValue(), SamplingPolicy::CENTER, false }); ARM_COMPUTE_EXPECT(bool(result) == false, framework::LogLevel::ERRORS); } @@ -193,9 +192,9 @@ TEST_CASE(UsePadding, framework::DatasetMode::ALL) const auto output = TensorInfo{ output_shape, 1, default_data_type, default_data_layout }; Status result{}; - // When use padding is false, 
border mode should be constant - constexpr auto border_mode = BorderMode::UNDEFINED; - constexpr bool use_padding = false; + // Padding is not supported anymore + constexpr auto border_mode = BorderMode::CONSTANT; + constexpr bool use_padding = true; result = NEScale::validate(&input, &output, ScaleKernelInfo{ default_interpolation_policy, border_mode, PixelValue(), default_sampling_policy, use_padding }); ARM_COMPUTE_EXPECT(bool(result) == false, framework::LogLevel::ERRORS); @@ -211,7 +210,7 @@ TEST_CASE(AreaWithNHWC, framework::DatasetMode::ALL) const auto output = TensorInfo{ output_shape, 1, default_data_type, data_layout }; Status result{}; - result = NEScale::validate(&input, &output, ScaleKernelInfo{ interpolation_policy, default_border_mode }); + result = NEScale::validate(&input, &output, ScaleKernelInfo{ interpolation_policy, default_border_mode, PixelValue(), SamplingPolicy::CENTER, false }); ARM_COMPUTE_EXPECT(bool(result) == false, framework::LogLevel::ERRORS); } @@ -226,7 +225,7 @@ TEST_CASE(AreaWithNonU8, framework::DatasetMode::ALL) const auto output = TensorInfo{ output_shape, 1, data_type, data_layout }; Status result{}; - result = NEScale::validate(&input, &output, ScaleKernelInfo{ interpolation_policy, default_border_mode }); + result = NEScale::validate(&input, &output, ScaleKernelInfo{ interpolation_policy, default_border_mode, PixelValue(), SamplingPolicy::CENTER, false }); ARM_COMPUTE_EXPECT(bool(result) == false, framework::LogLevel::ERRORS); } @@ -241,11 +240,80 @@ TEST_CASE(AlignedCornerNotSupported, framework::DatasetMode::ALL) const auto output = TensorInfo{ output_shape, 1, default_data_type, default_data_layout }; Status result{}; - result = NEScale::validate(&input, &output, ScaleKernelInfo{ interpolation_policy, default_border_mode, PixelValue(), sampling_policy, default_use_padding, align_corners }); + result = NEScale::validate(&input, &output, ScaleKernelInfo{ interpolation_policy, default_border_mode, PixelValue(), sampling_policy, false, align_corners }); ARM_COMPUTE_EXPECT(bool(result) == false, framework::LogLevel::ERRORS); } TEST_SUITE_END() // Validate +DATA_TEST_CASE(CheckNoPadding, framework::DatasetMode::ALL, combine(combine(combine(combine(datasets::Medium4DShapes(), + framework::dataset::make("DataType", { DataType::F32, DataType::QASYMM8 })), + framework::dataset::make("InterpolationPolicy", { InterpolationPolicy::BILINEAR, InterpolationPolicy::NEAREST_NEIGHBOR })), + framework::dataset::make("SamplingPolicy", { SamplingPolicy::CENTER, SamplingPolicy::TOP_LEFT })), + framework::dataset::make("DataLayout", { DataLayout::NHWC, DataLayout::NCHW })), + shape, data_type, interpolation_policy, sampling_policy, data_layout) +{ + constexpr auto default_border_mode = BorderMode::CONSTANT; + ScaleKernelInfo info(interpolation_policy, default_border_mode, PixelValue(), sampling_policy, false); + + // Create tensors + Tensor src = create_tensor(shape, data_type); + src.info()->set_data_layout(data_layout); + + const float scale_x = 0.5f; + const float scale_y = 0.5f; + TensorShape shape_scaled(shape); + const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + shape_scaled.set(idx_width, shape[idx_width] * scale_x, /* apply_dim_correction = */ false); + shape_scaled.set(idx_height, shape[idx_height] * scale_y, /* apply_dim_correction = */ false); + Tensor dst = create_tensor(shape_scaled, data_type); + + 
ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+
+    // Create and configure function
+    NEScale scale;
+    scale.configure(&src, &dst, info);
+
+    validate(src.info()->padding(), PaddingSize(0, 0, 0, 0));
+    validate(dst.info()->padding(), PaddingSize(0, 0, 0, 0));
+}
+
+DATA_TEST_CASE(CheckNoPaddingInterpAREA, framework::DatasetMode::ALL, combine(combine(combine(combine(datasets::Medium4DShapes(),
+                                                                              framework::dataset::make("DataType", { DataType::U8 })),
+                                                                              framework::dataset::make("InterpolationPolicy", { InterpolationPolicy::AREA })),
+                                                                              framework::dataset::make("SamplingPolicy", { SamplingPolicy::CENTER, SamplingPolicy::TOP_LEFT })),
+                                                                              framework::dataset::make("DataLayout", { DataLayout::NCHW })),
+               shape, data_type, interpolation_policy, sampling_policy, data_layout)
+{
+    constexpr auto default_border_mode = BorderMode::CONSTANT;
+    ScaleKernelInfo info(interpolation_policy, default_border_mode, PixelValue(), sampling_policy, false);
+
+    // Create tensors
+    Tensor src = create_tensor<Tensor>(shape, data_type);
+    src.info()->set_data_layout(data_layout);
+
+    const float scale_x = 0.5f;
+    const float scale_y = 0.5f;
+    TensorShape shape_scaled(shape);
+    const int   idx_width  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+    const int   idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+    shape_scaled.set(idx_width, shape[idx_width] * scale_x, /* apply_dim_correction = */ false);
+    shape_scaled.set(idx_height, shape[idx_height] * scale_y, /* apply_dim_correction = */ false);
+
+    Tensor dst = create_tensor<Tensor>(shape_scaled, data_type);
+
+    ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+
+    // Create and configure function
+    NEScale scale;
+    scale.configure(&src, &dst, info);
+
+    validate(src.info()->padding(), PaddingSize(0, 0, 0, 0));
+    validate(dst.info()->padding(), PaddingSize(0, 0, 0, 0));
+}
+
 template <typename T>
 using NEScaleFixture = ScaleValidationFixture<Tensor, Accessor, NEScale, T>;
 template <typename T>
diff --git a/tests/validation/fixtures/ScaleFixture.h b/tests/validation/fixtures/ScaleFixture.h
index e2ed3ab6f9..1e66306715 100644
--- a/tests/validation/fixtures/ScaleFixture.h
+++ b/tests/validation/fixtures/ScaleFixture.h
@@ -137,7 +137,7 @@ protected:
         // Create and configure function
         FunctionType scale;
-        scale.configure(&src, &dst, ScaleKernelInfo{ _policy, _border_mode, _constant_border_value, _sampling_policy, /* use_padding */ true, _align_corners });
+        scale.configure(&src, &dst, ScaleKernelInfo{ _policy, _border_mode, _constant_border_value, _sampling_policy, /* use_padding */ false, _align_corners });
         ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
         ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-- cgit v1.2.1
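Note: every bilinear path above funnels into a compute_bilinear(a00, a01, a10, a11, dx_val, dy_val) helper whose body lies outside this diff. A minimal sketch of the weighting it is expected to apply, consistent with the fractional dx/dy deltas precomputed in NEScale.cpp (the helper's exact name and signature in the library may differ):

    // Standard bilinear blend of four neighbours; dx/dy are the fractional
    // distances of the sample point from the top-left neighbour a00.
    template <typename T>
    inline float compute_bilinear(T a00, T a01, T a10, T a11, float dx, float dy)
    {
        const float dx1 = 1.0f - dx;
        const float dy1 = 1.0f - dy;
        return static_cast<float>(a00) * dx1 * dy1   // top-left
               + static_cast<float>(a01) * dx * dy1  // top-right
               + static_cast<float>(a10) * dx1 * dy  // bottom-left
               + static_cast<float>(a11) * dx * dy;  // bottom-right
    }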
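Note: in the NHWC nearest-neighbour kernel, the precomputed column index becomes an element offset by multiplying by the channel count, and the chosen source row becomes in_hi * in_dim_wc, so each output point copies one contiguous run of channels. The same arithmetic in scalar form (nearest_point_nhwc is illustrative, not a library function):

    #include <cstddef>
    #include <cstdint>

    // NHWC nearest neighbour: one output point copies all in_dim_c channel
    // values from the selected (src_x, src_y) of the source plane. Offsets
    // are element counts, matching the offsets tensor now holding indices.
    void nearest_point_nhwc(const float *in_plane, float *out_point, int32_t src_x, int32_t src_y,
                            std::size_t in_dim_c, std::size_t in_dim_w)
    {
        const std::size_t offset     = static_cast<std::size_t>(src_x) * in_dim_c;             // column -> element offset
        const std::size_t offset_row = static_cast<std::size_t>(src_y) * in_dim_w * in_dim_c;  // row -> element offset
        for(std::size_t c = 0; c < in_dim_c; ++c)
        {
            out_point[c] = in_plane[offset + offset_row + c];
        }
    }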
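Note: the padding removal works because scale_nearest_nhwc() no longer assumes it may over-read past the window: the X span is walked with full vectors while a whole register's worth of elements remains, then finished element by element. The same pattern reduced to a standalone copy loop for NEON targets (copy_row is illustrative; the kernel uses wrapper::vloadq/wrapper::vstore over the same bounds):

    #include <arm_neon.h>

    // Vector body plus scalar tail: no access ever lands at or beyond
    // window_end_x, so no implicit border padding is required.
    void copy_row(const float *src, float *dst, int window_start_x, int window_end_x)
    {
        constexpr int window_step_x = 16 / sizeof(float); // elements per 128-bit register
        int x = window_start_x;
        for(; x <= window_end_x - window_step_x; x += window_step_x)
        {
            vst1q_f32(dst + x, vld1q_f32(src + x)); // full vectors while they fit
        }
        for(; x < window_end_x; ++x)
        {
            dst[x] = src[x]; // scalar tail
        }
    }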
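Note: scale_bilinear_qasymm() never interpolates raw quantized codes. Each neighbour is dequantized with the input UniformQuantizationInfo, blended in float, and requantized with the output one. A scalar sketch for QASYMM8, assuming the usual affine mapping real = scale * (q - offset); UQInfo stands in for arm_compute::UniformQuantizationInfo, and the library's quantize helper additionally applies a configurable rounding policy:

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    struct UQInfo
    {
        float   scale;
        int32_t offset;
    };

    // Dequantize -> bilinear blend in float -> requantize with saturation.
    uint8_t bilinear_qasymm8(uint8_t a00, uint8_t a01, uint8_t a10, uint8_t a11,
                             float dx, float dy, UQInfo in_q, UQInfo out_q)
    {
        auto dq = [&](uint8_t q) { return in_q.scale * (static_cast<int32_t>(q) - in_q.offset); };

        const float dx1 = 1.0f - dx;
        const float dy1 = 1.0f - dy;
        const float v   = dq(a00) * dx1 * dy1 + dq(a01) * dx * dy1
                          + dq(a10) * dx1 * dy + dq(a11) * dx * dy;

        const int32_t q = static_cast<int32_t>(std::lround(v / out_q.scale)) + out_q.offset;
        return static_cast<uint8_t>(std::min<int32_t>(255, std::max<int32_t>(0, q)));
    }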
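Note: precompute_dx_dy_offsets() now stores plain element indices (the input_element_size multiply is gone), leaving each templated kernel to apply its own element type at load time; dx/dy keep the fractional parts. A sketch of the per-destination-column precompute for the bilinear case, assuming the index formula matches the (x + sampling_offset) * wr - sampling_offset form visible in the kernels (the vertical dy term is produced the same way along y, and the nearest path rounds instead of flooring without subtracting the offset):

    #include <cmath>
    #include <cstdint>

    // For each destination x, record which source column to sample (an
    // element index, not a byte offset) and the fractional blend weight.
    void precompute_x(int32_t *offsets, float *dx, int out_width, float wr, float sampling_offset)
    {
        for(int x = 0; x < out_width; ++x)
        {
            const float in_x  = (x + sampling_offset) * wr - sampling_offset;
            const int   in_xi = static_cast<int>(std::floor(in_x));
            offsets[x] = in_xi;        // element index into the source row
            dx[x]      = in_x - in_xi; // horizontal interpolation weight
        }
    }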