From c4f2743951473f8d97f5a43767fdbb31a4df967c Mon Sep 17 00:00:00 2001 From: Gunes Bayir Date: Sun, 11 Sep 2022 15:59:19 +0100 Subject: =?UTF-8?q?Optimize=20Quantized/Integer=20Bilinear=20Scale=20for?= =?UTF-8?q?=20Neon=E2=84=A2?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch introduces several performance optimizations regarding the Bilinear Scale operator with REPLICATE Border mode. Changes apply only to NHWC. This patch - Reduces the memory footprint by disabling precomputation of indices and weights when they're not used - Rewrites the kernels for QASYMM8/QASYMM8_SIGNED/U8(Uint8) - Adds S8(Int8) Bilinear Scale for Border mode REPLICATE - Removes Bilinear Scale SVE kernels for Quantized and Integer types and adjust the heuristics to choose the Neon™ implementation - Adds new test cases where the input and output of the Bilinear Scale operator have different quantization scale and offset Resolves: COMPMID-5453, COMPMID-5454 Change-Id: I3d251e76e0c6978fd5a0a1795ec62ab536bec93c Signed-off-by: Gunes Bayir Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/8250 Reviewed-by: SiCong Li Comments-Addressed: Arm Jenkins Tested-by: Arm Jenkins Benchmark: Arm Jenkins --- tests/validation/NEON/Scale.cpp | 72 ++++++++++++++++++++++++++++++++--------- 1 file changed, 57 insertions(+), 15 deletions(-) (limited to 'tests/validation/NEON') diff --git a/tests/validation/NEON/Scale.cpp b/tests/validation/NEON/Scale.cpp index e386d804ca..9927e21490 100644 --- a/tests/validation/NEON/Scale.cpp +++ b/tests/validation/NEON/Scale.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2021 Arm Limited. + * Copyright (c) 2017-2022 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -22,16 +22,10 @@ * SOFTWARE. */ #include "arm_compute/core/Helpers.h" -#include "arm_compute/core/Types.h" #include "arm_compute/runtime/NEON/functions/NEScale.h" -#include "arm_compute/runtime/Tensor.h" -#include "arm_compute/runtime/TensorAllocator.h" #include "tests/NEON/Accessor.h" -#include "tests/PaddingCalculator.h" #include "tests/datasets/ScaleValidationDataset.h" -#include "tests/framework/Asserts.h" #include "tests/framework/Macros.h" -#include "tests/validation/Helpers.h" #include "tests/validation/Validation.h" #include "tests/validation/fixtures/ScaleFixture.h" @@ -51,7 +45,7 @@ using datasets::ScaleAlignCornersSamplingPolicySet; /** We consider vector size in byte 64 since the maximum size of * a vector used by the kernel is currently 64-byte (float32x4x4). - * There are possibility to reduce test time further by using + * There is possibility to reduce test time further by using * smaller vector sizes for different data types where applicable. */ constexpr uint32_t vector_byte = 64; @@ -62,26 +56,31 @@ constexpr uint32_t num_elements_per_vector() return vector_byte / sizeof(T); } -/** Scale data types */ -const auto ScaleDataTypes = framework::dataset::make("DataType", +/** Quantization information data set */ +const auto QuantizationInfoSet = framework::dataset::make("QuantizationInfo", { - DataType::U8, - DataType::S16, - DataType::F32, + QuantizationInfo(0.5f, -10), }); /** Quantization information data set */ -const auto QuantizationInfoSet = framework::dataset::make("QuantizationInfo", +const auto InputQuantizationInfoSet = framework::dataset::make("InputQuantizationInfo", { QuantizationInfo(0.5f, -10), }); +/** Quantization information data set */ +const auto OutputQuantizationInfoSet = framework::dataset::make("OutputQuantizationInfo", +{ + QuantizationInfo(0.2f, 20), +}); + /** Tolerance */ constexpr AbsoluteTolerance tolerance_u8(1); +constexpr AbsoluteTolerance tolerance_s8(1); constexpr AbsoluteTolerance tolerance_s16(1); RelativeTolerance tolerance_f32(0.05); #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -constexpr float abs_tolerance_f16(0.01f); +constexpr float abs_tolerance_f16(0.01f); RelativeTolerance tolerance_f16(half(0.1)); #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ @@ -321,6 +320,8 @@ using NEScaleMixedDataLayoutFixture = ScaleValidationFixture using NEScaleQuantizedFixture = ScaleValidationQuantizedFixture; template +using NEScaleDifferentOutputQuantizedFixture = ScaleValidationDifferentOutputQuantizedFixture; +template using NEScaleQuantizedMixedDataLayoutFixture = ScaleValidationQuantizedFixture; TEST_SUITE(Float) @@ -457,6 +458,27 @@ FIXTURE_DATA_TEST_CASE(RunSmallAlignCorners, NEScaleFixture, framework: validate(Accessor(_target), _reference, valid_region, tolerance_u8); } TEST_SUITE_END() // U8 +TEST_SUITE(S8) +const auto s8_shape = combine((SCALE_SHAPE_DATASET(num_elements_per_vector())), framework::dataset::make("DataType", DataType::S8)); +FIXTURE_DATA_TEST_CASE(RunSmall, NEScaleFixture, framework::DatasetMode::ALL, ASSEMBLE_S8_DATASET(s8_shape, ScaleSamplingPolicySet)) +{ + //Create valid region + TensorInfo src_info(_shape, 1, _data_type); + ValidRegion valid_region = calculate_valid_region_scale(src_info, _reference.shape(), _policy, _sampling_policy, (_border_mode == BorderMode::UNDEFINED)); + + // Validate output + validate(Accessor(_target), _reference, valid_region, tolerance_s8); +} +FIXTURE_DATA_TEST_CASE(RunSmallAlignCorners, NEScaleFixture, framework::DatasetMode::ALL, ASSEMBLE_S8_DATASET(s8_shape, ScaleAlignCornersSamplingPolicySet)) +{ + //Create valid region + TensorInfo src_info(_shape, 1, _data_type); + ValidRegion valid_region = calculate_valid_region_scale(src_info, _reference.shape(), _policy, _sampling_policy, (_border_mode == BorderMode::UNDEFINED)); + + // Validate output + validate(Accessor(_target), _reference, valid_region, tolerance_s8); +} +TEST_SUITE_END() // S8 TEST_SUITE(S16) const auto s16_shape = combine((SCALE_SHAPE_DATASET(num_elements_per_vector())), framework::dataset::make("DataType", DataType::S16)); FIXTURE_DATA_TEST_CASE(RunSmall, NEScaleFixture, framework::DatasetMode::ALL, ASSEMBLE_DATASET(s16_shape, ScaleSamplingPolicySet)) @@ -492,6 +514,16 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEScaleQuantizedFixture, framework::Da // Validate output validate(Accessor(_target), _reference, valid_region, tolerance_u8); } +FIXTURE_DATA_TEST_CASE(RunSmallDifferentOutputQuantization, NEScaleDifferentOutputQuantizedFixture, framework::DatasetMode::ALL, + ASSEMBLE_DIFFERENTLY_QUANTIZED_DATASET(qasymm8_shape, ScaleSamplingPolicySet, InputQuantizationInfoSet, OutputQuantizationInfoSet)) +{ + //Create valid region + TensorInfo src_info(_shape, 1, _data_type); + ValidRegion valid_region = calculate_valid_region_scale(src_info, _reference.shape(), _policy, _sampling_policy, (_border_mode == BorderMode::UNDEFINED)); + + // Validate output + validate(Accessor(_target), _reference, valid_region, tolerance_u8); +} FIXTURE_DATA_TEST_CASE(RunMixedDataLayout, NEScaleQuantizedMixedDataLayoutFixture, framework::DatasetMode::ALL, ASSEMBLE_QUANTIZED_DATASET(qasymm8_shape, ScaleSamplingPolicySet, QuantizationInfoSet)) { @@ -525,6 +557,16 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEScaleQuantizedFixture, framework::Dat // Validate output validate(Accessor(_target), _reference, valid_region, tolerance_qasymm8_signed); } +FIXTURE_DATA_TEST_CASE(RunSmallDifferentOutputQuantization, NEScaleDifferentOutputQuantizedFixture, framework::DatasetMode::ALL, + ASSEMBLE_DIFFERENTLY_QUANTIZED_DATASET(qasymm8_signed_shape, ScaleSamplingPolicySet, InputQuantizationInfoSet, OutputQuantizationInfoSet)) +{ + //Create valid region + TensorInfo src_info(_shape, 1, _data_type); + ValidRegion valid_region = calculate_valid_region_scale(src_info, _reference.shape(), _policy, _sampling_policy, (_border_mode == BorderMode::UNDEFINED)); + + // Validate output + validate(Accessor(_target), _reference, valid_region, tolerance_qasymm8_signed); +} FIXTURE_DATA_TEST_CASE(RunSmallAlignCorners, NEScaleQuantizedFixture, framework::DatasetMode::ALL, ASSEMBLE_QUANTIZED_DATASET(qasymm8_signed_shape, ScaleAlignCornersSamplingPolicySet, QuantizationInfoSet)) { -- cgit v1.2.1