From c4f2743951473f8d97f5a43767fdbb31a4df967c Mon Sep 17 00:00:00 2001
From: Gunes Bayir <gunes.bayir@arm.com>
Date: Sun, 11 Sep 2022 15:59:19 +0100
Subject: =?UTF-8?q?Optimize=20Quantized/Integer=20Bilinear=20Scale=20for?=
 =?UTF-8?q?=20Neon=E2=84=A2?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This patch introduces several performance optimizations regarding the Bilinear Scale operator with REPLICATE Border mode. Changes apply only to NHWC.

This patch
   - Reduces the memory footprint by disabling precomputation of indices and weights when they're not used
   - Rewrites the kernels for QASYMM8/QASYMM8_SIGNED/U8(Uint8)
   - Adds S8(Int8) Bilinear Scale for Border mode REPLICATE
   - Removes Bilinear Scale SVE kernels for Quantized and Integer types and adjust the heuristics to choose the Neon™ implementation
   - Adds new test cases where the input and output of the Bilinear Scale operator have different quantization scale and offset

Resolves: COMPMID-5453, COMPMID-5454

Change-Id: I3d251e76e0c6978fd5a0a1795ec62ab536bec93c
Signed-off-by: Gunes Bayir <gunes.bayir@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/8250
Reviewed-by: SiCong Li <sicong.li@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Benchmark: Arm Jenkins <bsgcomp@arm.com>
---
 tests/validation/NEON/Scale.cpp | 72 ++++++++++++++++++++++++++++++++---------
 1 file changed, 57 insertions(+), 15 deletions(-)

(limited to 'tests/validation/NEON')
diff --git a/tests/validation/NEON/Scale.cpp b/tests/validation/NEON/Scale.cpp
index e386d804ca..9927e21490 100644
--- a/tests/validation/NEON/Scale.cpp
+++ b/tests/validation/NEON/Scale.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2022 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -22,16 +22,10 @@
  * SOFTWARE.
  */
 #include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/NEON/functions/NEScale.h"
-#include "arm_compute/runtime/Tensor.h"
-#include "arm_compute/runtime/TensorAllocator.h"
 #include "tests/NEON/Accessor.h"
-#include "tests/PaddingCalculator.h"
 #include "tests/datasets/ScaleValidationDataset.h"
-#include "tests/framework/Asserts.h"
 #include "tests/framework/Macros.h"
-#include "tests/validation/Helpers.h"
 #include "tests/validation/Validation.h"
 #include "tests/validation/fixtures/ScaleFixture.h"
 
@@ -51,7 +45,7 @@ using datasets::ScaleAlignCornersSamplingPolicySet;
 
 /** We consider vector size in byte 64 since the maximum size of
  * a vector used by the kernel is currently 64-byte (float32x4x4).
- * There are possibility to reduce test time further by using
+ * There is possibility to reduce test time further by using
  * smaller vector sizes for different data types where applicable.
  */
 constexpr uint32_t vector_byte = 64;
@@ -62,26 +56,31 @@ constexpr uint32_t num_elements_per_vector()
     return vector_byte / sizeof(T);
 }
 
-/** Scale data types */
-const auto ScaleDataTypes = framework::dataset::make("DataType",
+/** Quantization information data set */
+const auto QuantizationInfoSet = framework::dataset::make("QuantizationInfo",
 {
-    DataType::U8,
-    DataType::S16,
-    DataType::F32,
+    QuantizationInfo(0.5f, -10),
 });
 
 /** Quantization information data set */
-const auto QuantizationInfoSet = framework::dataset::make("QuantizationInfo",
+const auto InputQuantizationInfoSet = framework::dataset::make("InputQuantizationInfo",
 {
     QuantizationInfo(0.5f, -10),
 });
 
+/** Quantization information data set */
+const auto OutputQuantizationInfoSet = framework::dataset::make("OutputQuantizationInfo",
+{
+    QuantizationInfo(0.2f, 20),
+});
+
 /** Tolerance */
 constexpr AbsoluteTolerance<uint8_t> tolerance_u8(1);
+constexpr AbsoluteTolerance<int8_t>  tolerance_s8(1);
 constexpr AbsoluteTolerance<int16_t> tolerance_s16(1);
 RelativeTolerance<float>             tolerance_f32(0.05);
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-constexpr float          abs_tolerance_f16(0.01f);
+constexpr float         abs_tolerance_f16(0.01f);
 RelativeTolerance<half> tolerance_f16(half(0.1));
 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 
@@ -321,6 +320,8 @@ using NEScaleMixedDataLayoutFixture = ScaleValidationFixture<Tensor, Accessor, N
 template <typename T>
 using NEScaleQuantizedFixture = ScaleValidationQuantizedFixture<Tensor, Accessor, NEScale, T>;
 template <typename T>
+using NEScaleDifferentOutputQuantizedFixture = ScaleValidationDifferentOutputQuantizedFixture<Tensor, Accessor, NEScale, T>;
+template <typename T>
 using NEScaleQuantizedMixedDataLayoutFixture = ScaleValidationQuantizedFixture<Tensor, Accessor, NEScale, T, true>;
 
 TEST_SUITE(Float)
@@ -457,6 +458,27 @@ FIXTURE_DATA_TEST_CASE(RunSmallAlignCorners, NEScaleFixture<uint8_t>, framework:
     validate(Accessor(_target), _reference, valid_region, tolerance_u8);
 }
 TEST_SUITE_END() // U8
+TEST_SUITE(S8)
+const auto s8_shape = combine((SCALE_SHAPE_DATASET(num_elements_per_vector<int8_t>())), framework::dataset::make("DataType", DataType::S8));
+FIXTURE_DATA_TEST_CASE(RunSmall, NEScaleFixture<int8_t>, framework::DatasetMode::ALL, ASSEMBLE_S8_DATASET(s8_shape, ScaleSamplingPolicySet))
+{
+    //Create valid region
+    TensorInfo  src_info(_shape, 1, _data_type);
+    ValidRegion valid_region = calculate_valid_region_scale(src_info, _reference.shape(), _policy, _sampling_policy, (_border_mode == BorderMode::UNDEFINED));
+
+    // Validate output
+    validate(Accessor(_target), _reference, valid_region, tolerance_s8);
+}
+FIXTURE_DATA_TEST_CASE(RunSmallAlignCorners, NEScaleFixture<int8_t>, framework::DatasetMode::ALL, ASSEMBLE_S8_DATASET(s8_shape, ScaleAlignCornersSamplingPolicySet))
+{
+    //Create valid region
+    TensorInfo  src_info(_shape, 1, _data_type);
+    ValidRegion valid_region = calculate_valid_region_scale(src_info, _reference.shape(), _policy, _sampling_policy, (_border_mode == BorderMode::UNDEFINED));
+
+    // Validate output
+    validate(Accessor(_target), _reference, valid_region, tolerance_s8);
+}
+TEST_SUITE_END() // S8
 TEST_SUITE(S16)
 const auto s16_shape = combine((SCALE_SHAPE_DATASET(num_elements_per_vector<int16_t>())), framework::dataset::make("DataType", DataType::S16));
 FIXTURE_DATA_TEST_CASE(RunSmall, NEScaleFixture<int16_t>, framework::DatasetMode::ALL, ASSEMBLE_DATASET(s16_shape, ScaleSamplingPolicySet))
@@ -492,6 +514,16 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEScaleQuantizedFixture<uint8_t>, framework::Da
     // Validate output
     validate(Accessor(_target), _reference, valid_region, tolerance_u8);
 }
+FIXTURE_DATA_TEST_CASE(RunSmallDifferentOutputQuantization, NEScaleDifferentOutputQuantizedFixture<uint8_t>, framework::DatasetMode::ALL,
+                       ASSEMBLE_DIFFERENTLY_QUANTIZED_DATASET(qasymm8_shape, ScaleSamplingPolicySet, InputQuantizationInfoSet, OutputQuantizationInfoSet))
+{
+    //Create valid region
+    TensorInfo  src_info(_shape, 1, _data_type);
+    ValidRegion valid_region = calculate_valid_region_scale(src_info, _reference.shape(), _policy, _sampling_policy, (_border_mode == BorderMode::UNDEFINED));
+
+    // Validate output
+    validate(Accessor(_target), _reference, valid_region, tolerance_u8);
+}
 FIXTURE_DATA_TEST_CASE(RunMixedDataLayout, NEScaleQuantizedMixedDataLayoutFixture<uint8_t>, framework::DatasetMode::ALL, ASSEMBLE_QUANTIZED_DATASET(qasymm8_shape, ScaleSamplingPolicySet,
                        QuantizationInfoSet))
 {
@@ -525,6 +557,16 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEScaleQuantizedFixture<int8_t>, framework::Dat
     // Validate output
     validate(Accessor(_target), _reference, valid_region, tolerance_qasymm8_signed);
 }
+FIXTURE_DATA_TEST_CASE(RunSmallDifferentOutputQuantization, NEScaleDifferentOutputQuantizedFixture<int8_t>, framework::DatasetMode::ALL,
+                       ASSEMBLE_DIFFERENTLY_QUANTIZED_DATASET(qasymm8_signed_shape, ScaleSamplingPolicySet, InputQuantizationInfoSet, OutputQuantizationInfoSet))
+{
+    //Create valid region
+    TensorInfo  src_info(_shape, 1, _data_type);
+    ValidRegion valid_region = calculate_valid_region_scale(src_info, _reference.shape(), _policy, _sampling_policy, (_border_mode == BorderMode::UNDEFINED));
+
+    // Validate output
+    validate(Accessor(_target), _reference, valid_region, tolerance_qasymm8_signed);
+}
 FIXTURE_DATA_TEST_CASE(RunSmallAlignCorners, NEScaleQuantizedFixture<int8_t>, framework::DatasetMode::ALL, ASSEMBLE_QUANTIZED_DATASET(qasymm8_signed_shape, ScaleAlignCornersSamplingPolicySet,
                        QuantizationInfoSet))
 {
-- 
cgit v1.2.1