From c4f2743951473f8d97f5a43767fdbb31a4df967c Mon Sep 17 00:00:00 2001 From: Gunes Bayir Date: Sun, 11 Sep 2022 15:59:19 +0100 Subject: =?UTF-8?q?Optimize=20Quantized/Integer=20Bilinear=20Scale=20for?= =?UTF-8?q?=20Neon=E2=84=A2?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch introduces several performance optimizations regarding the Bilinear Scale operator with REPLICATE Border mode. Changes apply only to NHWC. This patch - Reduces the memory footprint by disabling precomputation of indices and weights when they're not used - Rewrites the kernels for QASYMM8/QASYMM8_SIGNED/U8(Uint8) - Adds S8(Int8) Bilinear Scale for Border mode REPLICATE - Removes Bilinear Scale SVE kernels for Quantized and Integer types and adjusts the heuristics to choose the Neon™ implementation - Adds new test cases where the input and output of the Bilinear Scale operator have different quantization scale and offset Resolves: COMPMID-5453, COMPMID-5454 Change-Id: I3d251e76e0c6978fd5a0a1795ec62ab536bec93c Signed-off-by: Gunes Bayir Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/8250 Reviewed-by: SiCong Li Comments-Addressed: Arm Jenkins Tested-by: Arm Jenkins Benchmark: Arm Jenkins --- src/cpu/kernels/scale/sve/qasymm8.cpp | 111 ++-------------------------------- 1 file changed, 5 insertions(+), 106 deletions(-) (limited to 'src/cpu/kernels/scale/sve/qasymm8.cpp') diff --git a/src/cpu/kernels/scale/sve/qasymm8.cpp b/src/cpu/kernels/scale/sve/qasymm8.cpp index 09ef00a783..d45a69e43b 100644 --- a/src/cpu/kernels/scale/sve/qasymm8.cpp +++ b/src/cpu/kernels/scale/sve/qasymm8.cpp @@ -83,108 +83,6 @@ void qasymm8_sve_scale_nearest(const ITensor *src, ITensor *dst, const ITensor * }, out); } - -void qasymm8_sve_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, - BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, - bool align_corners, 
const Window &window) -{ - // Data layout is NHWC - const int idx_width = 1; - const int idx_height = 2; - - // Compute the ratio between source height and destination height - const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(idx_height), dst->info()->dimension(idx_height), align_corners); - Window win_off; - win_off.set(Window::DimX, Window::Dimension(0, 0, 0)); - win_off.set(Window::DimY, Window::Dimension(0, 0, 0)); - - // Don't increment in X and Y direction for the input tensor - // A pointer to the start of this plane is needed as base for the precomputed offsets - Window win_in(window); - win_in.set(idx_width, Window::Dimension(0, 0, 0)); - win_in.set(idx_height, Window::Dimension(0, 0, 0)); - - for(size_t d = Window::DimZ; d < offsets->info()->num_dimensions(); ++d) - { - win_off.set(d, Window::Dimension(0, 0, 0)); - } - - Iterator in(src, win_in); - Iterator out(dst, window); - - const int32_t in_dim_w = src->info()->dimension(idx_width); - const int32_t in_dim_h = src->info()->dimension(idx_height); - const int32_t stride_w = src->info()->strides_in_bytes()[idx_width]; - const int32_t stride_h = src->info()->strides_in_bytes()[idx_height]; - - const UniformQuantizationInfo iq_info = src->info()->quantization_info().uniform(); - const UniformQuantizationInfo oq_info = dst->info()->quantization_info().uniform(); - - if(border_mode == BorderMode::CONSTANT) - { - const uint8_t const_border_value = static_cast(constant_border_value.get()); - execute_window_loop(window, [&](const Coordinates & id) - { - const int32_t index_h = std::floor((id[idx_height] + sampling_offset) * hr - sampling_offset); - const int32_t index_w = *(reinterpret_cast(offsets->ptr_to_element(Coordinates(id[idx_width], id[idx_height])))); - const auto dx_val = *(reinterpret_cast(dx->ptr_to_element(Coordinates(id[idx_width], id[idx_height])))); - const auto dy_val = *(reinterpret_cast(dy->ptr_to_element(Coordinates(id[idx_width], id[idx_height])))); - const auto 
pixel_row_ptr = reinterpret_cast(in.ptr()); - - const auto a00 = (0 <= index_w && index_w < in_dim_w && 0 <= index_h && index_h < in_dim_h) ? - (*(pixel_row_ptr + index_w * stride_w + index_h * stride_h)) : - const_border_value; - const auto a01 = (-1 <= index_w && index_w < in_dim_w - 1 && 0 <= index_h && index_h < in_dim_h) ? - (*(pixel_row_ptr + (index_w + 1) * stride_w + index_h * stride_h)) : - const_border_value; - const auto a10 = (0 <= index_w && index_w < in_dim_w && -1 <= index_h && index_h < in_dim_h - 1) ? - (*(pixel_row_ptr + index_w * stride_w + (index_h + 1) * stride_h)) : - const_border_value; - const auto a11 = (-1 <= index_w && index_w < in_dim_w - 1 && -1 <= index_h && index_h < in_dim_h - 1) ? - (*(pixel_row_ptr + (index_w + 1) * stride_w + (index_h + 1) * stride_h)) : - const_border_value; - - const float inp00 = Qasymm8QuantizationHelper::dequantize(a00, iq_info); - const float inp01 = Qasymm8QuantizationHelper::dequantize(a01, iq_info); - const float inp10 = Qasymm8QuantizationHelper::dequantize(a10, iq_info); - const float inp11 = Qasymm8QuantizationHelper::dequantize(a11, iq_info); - *reinterpret_cast(out.ptr()) = Qasymm8QuantizationHelper::quantize(scale_helpers::delta_bilinear(inp00, inp01, inp10, inp11, dx_val, dy_val), oq_info); - }, - in, out); - } - else if(border_mode == BorderMode::REPLICATE) - { - execute_window_loop(window, [&](const Coordinates & id) - { - const int index_h = std::floor((id[idx_height] + sampling_offset) * hr - sampling_offset); - const int32_t index_w = *(reinterpret_cast(offsets->ptr_to_element(Coordinates(id[idx_width], id[idx_height])))); - const auto dx_val = *(reinterpret_cast(dx->ptr_to_element(Coordinates(id[idx_width], id[idx_height])))); - const auto dy_val = *(reinterpret_cast(dy->ptr_to_element(Coordinates(id[idx_width], id[idx_height])))); - const auto pixel_row_ptr = reinterpret_cast(in.ptr()); - - auto clamped_w = utility::clamp(index_w, 0, in_dim_w - 1); - auto clamped_w1 = utility::clamp(index_w 
+ 1, 0, in_dim_w - 1); - auto clamped_h = utility::clamp(index_h, 0, in_dim_h - 1); - auto clamped_h1 = utility::clamp(index_h + 1, 0, in_dim_h - 1); - - const auto a00 = *(pixel_row_ptr + clamped_w * stride_w + clamped_h * stride_h); - const auto a01 = *(pixel_row_ptr + clamped_w1 * stride_w + clamped_h * stride_h); - const auto a10 = *(pixel_row_ptr + clamped_w * stride_w + clamped_h1 * stride_h); - const auto a11 = *(pixel_row_ptr + clamped_w1 * stride_w + clamped_h1 * stride_h); - - const float inp00 = Qasymm8QuantizationHelper::dequantize(a00, iq_info); - const float inp01 = Qasymm8QuantizationHelper::dequantize(a01, iq_info); - const float inp10 = Qasymm8QuantizationHelper::dequantize(a10, iq_info); - const float inp11 = Qasymm8QuantizationHelper::dequantize(a11, iq_info); - *reinterpret_cast(out.ptr()) = Qasymm8QuantizationHelper::quantize(scale_helpers::delta_bilinear(inp00, inp01, inp10, inp11, dx_val, dy_val), oq_info); - }, - in, out); - } - else - { - ARM_COMPUTE_ERROR("Not implemented"); - } -} } namespace cpu { @@ -192,13 +90,14 @@ void qasymm8_sve_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, bool align_corners, const Window &window) { - if(policy == InterpolationPolicy::BILINEAR) + ARM_COMPUTE_UNUSED(dx, dy, border_mode, constant_border_value); + if(policy == InterpolationPolicy::NEAREST_NEIGHBOR) { - qasymm8_sve_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, align_corners, window); + qasymm8_sve_scale_nearest(src, dst, offsets, sampling_offset, align_corners, window); } - else if(policy == InterpolationPolicy::NEAREST_NEIGHBOR) + else { - qasymm8_sve_scale_nearest(src, dst, offsets, sampling_offset, align_corners, window); + ARM_COMPUTE_ERROR("Not Implemented"); } } } // namespace cpu -- cgit v1.2.1