From a668f9f8a4eab405df0fe8dd58e7d9425bcf9640 Mon Sep 17 00:00:00 2001
From: Jonathan Deakin
Date: Wed, 24 Jan 2024 09:15:38 +0000
Subject: Add s8f32 kernels and dynamic QuantizationInfo

- Add support for QASYMM8_SIGNED*QASYMM8_SIGNED->F32 in CpuGemmLowpMatrixMultiplyCore.
- Add an s8f32 kernel that reuses the existing s8->s32 kernels with a new DequantizeFloat OutputStage; its structure is similar to Requantize32, but it works in the opposite direction.
- Add SME s8f32 kernels with integrated support for DequantizeFloat.
- Add scale to CpuGemmLowpOffsetContributionKernel.
- Add a virtual dequantize scale to gemm_common, currently only implemented for gemm_interleaved.
- Update year to 2024 in generate_build_files.
- Add a dynamic flag to QuantizationInfo which signals to operators that it can change after configuration.
- Add support for dynamic quantization in NEGEMMLowpMatrixMultiplyCore.
- Add a dynamic quantization fixture by extending GEMMLowpGenericMatrixMultiplyCoreValidationFixture.
- Add GEMMLowpDequantizedMatrixMultiplyValidationFixture.
- Store k (the number of columns of A) rather than k_offset in the offset contribution kernels, so that it can be recomputed when the other offsets change.

Relates to: ONCPUML-1444, MLINFSW-439

Co-authored-by: Milos Puzovic
Co-authored-by: David Mansell
Change-Id: I58a3acf2c09289a303e52eea6b336a696a5bc8da
Signed-off-by: Jonathan Deakin
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/11022
Reviewed-by: Gunes Bayir
Comments-Addressed: Arm Jenkins
Tested-by: Arm Jenkins
Benchmark: Arm Jenkins
---
 tests/validation/NEON/GEMMLowp.cpp               |  34 +++++++
 tests/validation/fixtures/GEMMLowpFixture.h      | 121 +++++++++++++++++++----
 tests/validation/reference/QuantizationLayer.cpp |  12 ++-
 3 files changed, 146 insertions(+), 21 deletions(-)

(limited to 'tests')

diff --git a/tests/validation/NEON/GEMMLowp.cpp b/tests/validation/NEON/GEMMLowp.cpp
index 514d156983..1231c21ddb 100644
--- a/tests/validation/NEON/GEMMLowp.cpp
+++ b/tests/validation/NEON/GEMMLowp.cpp
@@ -53,6 +53,7 @@ namespace
 {
 constexpr AbsoluteTolerance tolerance_batched(1);
 constexpr AbsoluteTolerance tolerance_quant(1);
+constexpr AbsoluteTolerance tolerance_dequantized(0.01f);
 } // namespace
 
@@ -62,6 +63,11 @@ TEST_SUITE(MatrixMultiplyCore)
 using NEGEMMLowpMatrixMultiplyCoreFixture = GEMMLowpMatrixMultiplyCoreValidationFixture;
 using NEGEMMLowpMatrixMultiplyCoreAccumulateFixture = GEMMLowpMatrixMultiplyAccumulateValidationFixture;
+using NEGEMMLowpBatchedMatMulFixture = GEMMLowpMatrixMultiplyCoreValidationFixture;
+using NEGEMMLowpMatrixMultiplyCoreDynamicQuantizationFixture = GEMMLowpMatrixMultiplyCoreDynamicQuantizationFixture;
+using NEGEMMLowpDequantizedMatrixMultiplyValidationFixture = GEMMLowpDequantizedMatrixMultiplyValidationFixture;
+
+using framework::dataset::make;
 
 DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, framework::dataset::concat(datasets::SmallGEMMLowpDataset(), datasets::LargeGEMMLowpDataset()),
                shape_a, shape_b, shape_c, a_offset, b_offset)
@@ -337,6 +343,34 @@ TEST_SUITE_END() // S32
 TEST_SUITE_END() // ACCUMULATION
 #endif // __arch64__
 
+TEST_SUITE(DynamicQuantization)
+FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMLowpMatrixMultiplyCoreDynamicQuantizationFixture, framework::DatasetMode::ALL, datasets::SmallGEMMLowpDataset())
+{
+    // Validate output
+    validate(Accessor(_target), _reference);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLarge, NEGEMMLowpMatrixMultiplyCoreDynamicQuantizationFixture, framework::DatasetMode::NIGHTLY, datasets::LargeGEMMLowpDataset())
+{
+    // Validate output
+    validate(Accessor(_target), _reference);
+}
+TEST_SUITE_END() // DynamicQuantization
+
+TEST_SUITE(Dequant)
+FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMLowpDequantizedMatrixMultiplyValidationFixture, framework::DatasetMode::ALL, datasets::SmallGEMMLowpDataset())
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_dequantized);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLarge, NEGEMMLowpDequantizedMatrixMultiplyValidationFixture, framework::DatasetMode::NIGHTLY, datasets::LargeGEMMLowpDataset())
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_dequantized);
+}
+TEST_SUITE_END() // Dequant
+
 TEST_SUITE_END() // MatrixMultiplyCore
 TEST_SUITE_END() // GEMMLowp
 TEST_SUITE_END() // NEON
diff --git a/tests/validation/fixtures/GEMMLowpFixture.h b/tests/validation/fixtures/GEMMLowpFixture.h
index 0f6908a468..11a491faa7 100644
--- a/tests/validation/fixtures/GEMMLowpFixture.h
+++ b/tests/validation/fixtures/GEMMLowpFixture.h
@@ -31,6 +31,7 @@
 #include "tests/validation/Validation.h"
 #include "tests/validation/reference/GEMMLowp.h"
 #include "tests/validation/reference/ArithmeticOperations.h"
+#include "tests/validation/reference/QuantizationLayer.h"
 #include 
 #include 
 
@@ -57,11 +58,21 @@ void fill_quantized(U &&tensor, int i)
 }
 
 template 
-void fill_s32(U &&tensor, int i, int32_t min, int32_t max)
+void fill(U &&tensor, int i, int32_t min, int32_t max)
 {
-    ARM_COMPUTE_ASSERT(tensor.data_type() == DataType::S32);
-    std::uniform_int_distribution distribution(min, max);
-    library->fill(tensor, distribution, i);
+    if (tensor.data_type() == DataType::S32) {
+        std::uniform_int_distribution distribution(min, max);
+        library->fill(tensor, distribution, i);
+    }
+    else if(tensor.data_type() == DataType::F32)
+    {
+        std::uniform_real_distribution distribution((float)min, (float)max);
+        library->fill(tensor, distribution, i);
+    }
+    else
+    {
+        ARM_COMPUTE_ERROR("NOT SUPPORTED!");
+    }
 }
 
 /** Information about how to fill tensors */
@@ -83,22 +94,25 @@ template 
-    TensorType a = create_tensor(shape_a, data_type_a, 1, a_qinfo);
-    TensorType b = create_tensor(shape_b, data_type_b, 1, b_qinfo); // gemm output before output stage mismatch if i pass data_layout_output here. to be investigated
+    // Create tensors
+    TensorType a = create_tensor(shape_a, data_type_a, 1, dynamic_qinfo ? QuantizationInfo(1.0,0,true) : a_qinfo);
+    TensorType b = create_tensor(shape_b, data_type_b, 1, dynamic_qinfo ? QuantizationInfo(1.0,0,true) : b_qinfo); // gemm output before output stage mismatch if i pass data_layout_output here. to be investigated
     TensorType output = create_tensor(shape_output, data_type_output, 1, output_qinfo /* output_qinfo will be ignored when output stage type is None */);
 
     TensorType bias;
     if(is_fused)
     {
         TensorShape bias_shape(shape_b[0]);
-        bias = create_tensor(bias_shape, DataType::S32, 1);
+        bias = create_tensor(bias_shape,data_type_output == DataType::F32 ? DataType::F32 : DataType::S32, 1);
     }
 
     // Create and configure function
@@ -109,6 +123,13 @@ TensorType compute_gemmlowp_target(const TensorShape &shape_a, const TensorShape
                             arm_compute::ActivationLayerInfo(), false /* fixed_format */, arm_compute::WeightFormat::UNSPECIFIED,
                             false /* pretranspose_B */, accumulate));
 
+    // If the QuantizationInfo is dynamic, it needs to be settable after configure (note that we also force it to be dynamic)
+    if (dynamic_qinfo)
+    {
+        a.info()->set_quantization_info(QuantizationInfo(a_qinfo.scale(), a_qinfo.offset(), true));
+        b.info()->set_quantization_info(QuantizationInfo(b_qinfo.scale(), b_qinfo.offset(), true));
+    }
+
     ARM_COMPUTE_ASSERT(a.info()->is_resizable());
     ARM_COMPUTE_ASSERT(b.info()->is_resizable());
     ARM_COMPUTE_ASSERT(output.info()->is_resizable());
@@ -131,7 +152,7 @@ TensorType compute_gemmlowp_target(const TensorShape &shape_a, const TensorShape
     if (accumulate)
     {
         ARM_COMPUTE_ASSERT(accumulate != run_twice);
-        fill_s32(AccessorType(output), 6 + finfo.hash, finfo.min_output, finfo.max_output);
+        fill(AccessorType(output), 6 + finfo.hash, finfo.min_output, finfo.max_output);
     }
 
     if(is_fused)
@@ -139,7 +160,7 @@ TensorType compute_gemmlowp_target(const TensorShape &shape_a, const TensorShape
         ARM_COMPUTE_ASSERT(bias.info()->is_resizable());
         bias.allocator()->allocate();
         ARM_COMPUTE_ASSERT(!bias.info()->is_resizable());
-        fill_s32(AccessorType(bias), 2 + finfo.hash, finfo.min_bias, finfo.max_bias);
+        fill(AccessorType(bias), 2 + finfo.hash, finfo.min_bias, finfo.max_bias);
     }
 
     // Run with variable inputs.
@@ -150,7 +171,7 @@ TensorType compute_gemmlowp_target(const TensorShape &shape_a, const TensorShape
         fill_quantized(AccessorType(b), 4 + finfo.hash);
         if(is_fused)
         {
-            fill_s32(AccessorType(bias), 5 + finfo.hash, finfo.min_bias, finfo.max_bias);
+            fill(AccessorType(bias), 5 + finfo.hash, finfo.min_bias, finfo.max_bias);
         }
     }
 
@@ -225,21 +246,20 @@ template 
-        return compute_gemmlowp_target(shape_a, shape_b, shape_output, a_qinfo, b_qinfo, output_qinfo,
-            DataType::QASYMM8, DataType::QASYMM8, GEMMLowpOutputStageInfo(), false, finfo, accumulate);
+        return compute_gemmlowp_target(shape_a, shape_b, shape_output, a_qinfo, b_qinfo, output_qinfo, DataType::QASYMM8, DataType::QASYMM8, GEMMLowpOutputStageInfo(), false, finfo, accumulate, dynamic_qinfo);
     }
 
     SimpleTensor compute_reference(const TensorShape &shape_a, const TensorShape &shape_b, const TensorShape &shape_output, const QuantizationInfo& a_qinfo, const QuantizationInfo& b_qinfo, const TensorFillInfo& finfo, bool accumulate)
@@ -250,7 +270,7 @@ protected:
     if (accumulate)
     {
         SimpleTensor output{ shape_output, DataType::S32, 1 };
-        fill_s32(output, 6 + finfo.hash, finfo.min_output, finfo.max_output);
+        fill(output, 6 + finfo.hash, finfo.min_output, finfo.max_output);
         reference::arithmetic_operation(reference::ArithmeticOperation::ADD, output, ref_output, output, ConvertPolicy::SATURATE);
         return output;
     }
@@ -282,6 +302,16 @@ public:
     }
 };
 
+template 
+class GEMMLowpMatrixMultiplyCoreDynamicQuantizationFixture : protected GEMMLowpGenericMatrixMultiplyCoreValidationFixture
+{
+public:
+    void setup(TensorShape shape_a, TensorShape shape_b, TensorShape shape_output, int32_t a_offset, int32_t b_offset)
+    {
+        GEMMLowpGenericMatrixMultiplyCoreValidationFixture::setup(shape_a, shape_b, shape_output, a_offset, b_offset, false /* accumulate */, true /* dynamic_qinfo */);
+    }
+};
+
 template 
 class GEMMLowpGenericMatrixMultiplyCoreFusedOffsetOutputValidationFixture : public framework::Fixture
 {
@@ -417,7 +447,7 @@ protected:
         TensorShape bias_shape(shape_b[0]);
         SimpleTensor bias{ bias_shape, DataType::S32, 1 };
-        (run_twice) ? fill_s32(bias, 5 + finfo.hash, finfo.min_bias, finfo.max_bias) : fill_s32(bias, 2 + finfo.hash, finfo.min_bias, finfo.max_bias); // Fill bias with same seed as last run of gemmlowp_target
+        (run_twice) ? fill(bias, 5 + finfo.hash, finfo.min_bias, finfo.max_bias) : fill(bias, 2 + finfo.hash, finfo.min_bias, finfo.max_bias); // Fill bias with same seed as last run of gemmlowp_target
 
         switch(output_stage.type)
         {
@@ -438,6 +468,57 @@ protected:
     SimpleTensor _reference{};
 };
 
+template 
+class GEMMLowpDequantizedMatrixMultiplyValidationFixture : public framework::Fixture
+{
+public:
+    void setup(TensorShape shape_a, TensorShape shape_b, TensorShape shape_output, int32_t a_offset, int32_t b_offset)
+    {
+        // Accumulation is supported for Int8/UInt8 only in aarch64
+        bool accumulate = true;
+        // Accumulation is not supported for Int8/UInt8 in aarch32
+#ifdef __arm__
+        accumulate = false;
+#endif //__arm__
+        bool dynamic_qinfo = false;
+        const auto a_qinfo = QuantizationInfo(1.0f / 255, a_offset);
+        const auto b_qinfo = QuantizationInfo(5.0f / 255, b_offset);
+        TensorFillInfo finfo;
+        _target = compute_target(shape_a, shape_b, shape_output, a_qinfo, b_qinfo, finfo, accumulate, dynamic_qinfo);
+        _reference = compute_reference(shape_a, shape_b, shape_output, a_qinfo, b_qinfo, finfo, accumulate);
+    }
+
+protected:
+    TensorType compute_target(const TensorShape &shape_a, const TensorShape &shape_b, const TensorShape &shape_output, const QuantizationInfo& a_qinfo, const QuantizationInfo& b_qinfo, const TensorFillInfo& finfo, const bool accumulate, const bool dynamic_qinfo)
+    {
+        const auto output_qinfo = QuantizationInfo();
+        return compute_gemmlowp_target(shape_a, shape_b, shape_output, a_qinfo, b_qinfo, output_qinfo, DataType::QASYMM8_SIGNED, DataType::QASYMM8_SIGNED, GEMMLowpOutputStageInfo(), false, finfo, accumulate, dynamic_qinfo, DataType::F32);
+    }
+
+    SimpleTensor compute_reference(const TensorShape &shape_a, const TensorShape &shape_b, const TensorShape &shape_output, const QuantizationInfo& a_qinfo, const QuantizationInfo& b_qinfo, const TensorFillInfo& finfo, bool accumulate)
+    {
+        SimpleTensor s32_ref_output = compute_gemmlowp_reference(shape_a, shape_b, shape_output, a_qinfo, b_qinfo,
+            DataType::QASYMM8_SIGNED, DataType::QASYMM8_SIGNED, finfo);
+
+        SimpleTensor f32_ref_output(s32_ref_output.shape(), DataType::F32);
+        QuantizationInfo dst_quant_info = QuantizationInfo(a_qinfo.uniform().scale * b_qinfo.uniform().scale, 0);
+        f32_ref_output = reference::quantization_layer(s32_ref_output, DataType::F32, dst_quant_info);
+
+        if (accumulate)
+        {
+            SimpleTensor output{ shape_output, DataType::F32, 1 };
+            fill(output, 6 + finfo.hash, finfo.min_output, finfo.max_output);
+            reference::arithmetic_operation(reference::ArithmeticOperation::ADD, output, f32_ref_output, output, ConvertPolicy::SATURATE);
+            return output;
+        }
+
+        return f32_ref_output;
+    }
+
+    TensorType _target{};
+    SimpleTensor _reference{};
+};
+
 template 
 class GEMMLowpMatrixMultiplyCoreFusedOffsetOutputValidationFixture : public GEMMLowpGenericMatrixMultiplyCoreFusedOffsetOutputValidationFixture
 {
diff --git a/tests/validation/reference/QuantizationLayer.cpp b/tests/validation/reference/QuantizationLayer.cpp
index 27665375c3..b76263bf95 100644
--- a/tests/validation/reference/QuantizationLayer.cpp
+++ b/tests/validation/reference/QuantizationLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2020, 2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -80,6 +80,15 @@ SimpleTensor quantization_layer(const SimpleTensor &src, DataType out
             dst[i] = quantize_qasymm16((src[i]), qinfo, rounding_policy);
         }
         break;
+        case DataType::F32:
+#if defined(_OPENMP)
+        #pragma omp parallel for
+#endif /* _OPENMP */
+            for(int i = 0; i < src.num_elements(); ++i)
+            {
+                dst[i] = dequantize_s32((src[i]), qinfo);
+            }
+            break;
         default:
             ARM_COMPUTE_ERROR("Unsupported output data type");
     }
@@ -127,6 +136,7 @@ template SimpleTensor quantization_layer(const SimpleTensor &src,
 template SimpleTensor quantization_layer(const SimpleTensor &src, DataType output_data_type, const QuantizationInfo &quantization_info);
 template SimpleTensor quantization_layer(const SimpleTensor &src, DataType output_data_type, const QuantizationInfo &quantization_info);
 template SimpleTensor quantization_layer(const SimpleTensor &src, DataType output_data_type, const QuantizationInfo &quantization_info);
+template SimpleTensor quantization_layer(const SimpleTensor &src, DataType output_data_type, const QuantizationInfo &quantization_info);
 } // namespace reference
 } // namespace validation
 } // namespace test
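
For context, the dynamic-quantization flow that the new GEMMLowpMatrixMultiplyCoreDynamicQuantizationFixture exercises is: configure the operator with placeholder QuantizationInfo that is flagged as dynamic, then set the real scales and offsets on the tensor infos after configure() and before run(). Below is a minimal sketch of that flow, distilled from compute_gemmlowp_target() above. The shapes, the set-up without an explicit output stage, and the QuantizationInfo(scale, offset, is_dynamic) overload are assumptions based on this patch, not a documented recipe.

// Sketch only: dynamic QuantizationInfo with NEGEMMLowpMatrixMultiplyCore,
// modelled on the test fixture in this patch.
#include "arm_compute/core/QuantizationInfo.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void run_dynamic_qinfo_gemm(float a_scale, int32_t a_offset, float b_scale, int32_t b_offset)
{
    Tensor a, b, dst;

    // Configure with placeholder quantization info whose third argument marks
    // it as dynamic, as the fixture does with QuantizationInfo(1.0, 0, true).
    a.allocator()->init(TensorInfo(TensorShape(32U, 16U), 1, DataType::QASYMM8_SIGNED, QuantizationInfo(1.0f, 0, true)));
    b.allocator()->init(TensorInfo(TensorShape(64U, 32U), 1, DataType::QASYMM8_SIGNED, QuantizationInfo(1.0f, 0, true)));
    dst.allocator()->init(TensorInfo(TensorShape(64U, 16U), 1, DataType::F32));

    NEGEMMLowpMatrixMultiplyCore gemm;
    gemm.configure(&a, &b, nullptr, &dst);

    // The real scales/offsets become known only later; because the info was
    // flagged dynamic they may be updated after configure() and before run().
    a.info()->set_quantization_info(QuantizationInfo(a_scale, a_offset, true));
    b.info()->set_quantization_info(QuantizationInfo(b_scale, b_offset, true));

    a.allocator()->allocate();
    b.allocator()->allocate();
    dst.allocator()->allocate();

    // ... fill a and b with QASYMM8_SIGNED data ...

    gemm.run();
}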
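
Likewise, the F32 branch added to the reference quantization_layer() above amounts to plain dequantization of the S32 accumulators: the fixture builds dst_quant_info with scale = a_scale * b_scale and a zero offset, so each output element is simply acc * a_scale * b_scale. A small self-contained check of that arithmetic follows; dequantize_s32_ref is an illustrative stand-in, not the library's dequantize_s32.

#include <cstdint>
#include <iostream>

// Dequantize an S32 GEMM accumulator with the combined scale used by the
// dequantized-matmul reference (offset is zero in dst_quant_info).
static float dequantize_s32_ref(int32_t acc, float a_scale, float b_scale)
{
    return static_cast<float>(acc) * (a_scale * b_scale);
}

int main()
{
    const float   a_scale = 1.0f / 255; // matches the fixture's a_qinfo
    const float   b_scale = 5.0f / 255; // matches the fixture's b_qinfo
    const int32_t acc     = 1000;       // an example S32 accumulator value
    std::cout << dequantize_s32_ref(acc, a_scale, b_scale) << std::endl; // ~0.0769
    return 0;
}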