From bb88f89b7a12e83eea2fc701f1f82aabf7dfcf7a Mon Sep 17 00:00:00 2001
From: SiCong Li
Date: Fri, 28 Aug 2020 11:18:47 +0100
Subject: COMPMID-3581 Add S32 support to NEPixelWiseMultiplication

* Add S32 support to NEPixelWiseMultiplication and
  NEPixelWiseMultiplicationKernel
* Scale == 1/255 is not supported for S32, as the precision requirement is
  not met on non-aarch64 targets, and scale is a non-standard parameter
  anyway.
* Fix the data type validation logic to also test for all invalid data type
  combinations.
* Add validation tests for S32 NEON PixelWiseMultiplication
* The wrap tolerance for ScaleOther (scale == 1/2^n) cases is set to 1
  instead of 0 because the reference uses floating-point division followed
  by rounding, which isn't bit-accurate.

Change-Id: I28839afda7a4f98c985d1763620e08d98f740142
Signed-off-by: SiCong Li
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/3923
Tested-by: Arm Jenkins
Reviewed-by: Michele Di Giorgio
Reviewed-by: Georgios Pinitas
Comments-Addressed: Arm Jenkins
---
 tests/validation/NEON/PixelWiseMultiplication.cpp  | 31 ++++++--
 .../reference/PixelWiseMultiplication.cpp          | 87 ++++++++++++++++++++++
 2 files changed, 112 insertions(+), 6 deletions(-)
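Note on the wrap tolerance of 1: for scale == 1/2^n the reference divides in
double precision and then rounds, while an int32 x int32 product can carry up
to 63 significant bits, more than a double's 53-bit mantissa, so the two paths
can differ by one unit. A minimal standalone sketch of the gap (the operand
values below are chosen purely for illustration and are not taken from the
tests):

    #include <cmath>
    #include <cstdint>
    #include <iostream>

    int main()
    {
        const int32_t a = 134217729; // 2^27 + 1
        const int32_t b = 67108867;  // 2^26 + 3
        // p = 2^53 + 469762051 needs 54 bits; converting it to double rounds
        // it up to p + 1 (round-to-nearest-even on an odd value)
        const int64_t p = static_cast<int64_t>(a) * static_cast<int64_t>(b);

        const int64_t exact   = p >> 1; // bit-accurate scale == 1/2, rounding TO_ZERO
        const int64_t via_dbl = static_cast<int64_t>(std::trunc(static_cast<double>(p) / 2.0));

        std::cout << (via_dbl - exact) << std::endl; // prints 1
        return 0;
    }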
diff --git a/tests/validation/NEON/PixelWiseMultiplication.cpp b/tests/validation/NEON/PixelWiseMultiplication.cpp
index 0b88628912..a66f6f192f 100644
--- a/tests/validation/NEON/PixelWiseMultiplication.cpp
+++ b/tests/validation/NEON/PixelWiseMultiplication.cpp
@@ -111,6 +111,8 @@ using NEPixelWiseMultiplicationToU8Fixture = PixelWiseMultiplicationValidationFi
 template <typename T>
 using NEPixelWiseMultiplicationToS16Fixture = PixelWiseMultiplicationValidationFixture<T, T, int16_t>;
 template <typename T>
+using NEPixelWiseMultiplicationToS32Fixture = PixelWiseMultiplicationValidationFixture<T, T, int32_t>;
+template <typename T>
 using NEPixelWiseMultiplicationToF16Fixture = PixelWiseMultiplicationValidationFixture<T, T, half_float::half>;
 template <typename T>
 using NEPixelWiseMultiplicationToF32Fixture = PixelWiseMultiplicationValidationFixture<T, T, float>;
@@ -139,6 +141,7 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(
                                               TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::QASYMM8),        //11 Mismatching data type
                                               TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::QASYMM8),        //12 Ok
                                               TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::QASYMM8_SIGNED), //13 Quantized cannot do WRAP
+                                              TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S32),            //14 S32 does not support scale255
                                             }),
       framework::dataset::make("Input2Info",{ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
                                               TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
@@ -153,6 +156,7 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(
                                               TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::QASYMM8_SIGNED),
                                               TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::QASYMM8),
                                               TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::QASYMM8_SIGNED),
+                                              TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S32),
                                             })),
       framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S16),
                                               TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
@@ -160,13 +164,14 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(
                                               TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
                                               TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
                                               TensorInfo(TensorShape(48U, 11U, 2U), 1, DataType::F32),
-                                              TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
+                                              TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),
                                               TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
                                               TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::QASYMM8_SIGNED),
                                               TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::QASYMM8_SIGNED),
                                               TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::QASYMM8_SIGNED),
                                               TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::QASYMM8),
                                               TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::QASYMM8_SIGNED),
+                                              TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S32),
                                             })),
       framework::dataset::make("Scale",{ scale_unity,
                                          scale_unity,
@@ -180,7 +185,8 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(
                                          scale_unity,
                                          scale_unity,
                                          scale_unity,
-                                         scale_unity})),
+                                         scale_unity,
+                                         scale_255})),
       framework::dataset::make("OverflowPolicy",{
                                          ConvertPolicy::WRAP,
                                          ConvertPolicy::WRAP,
@@ -195,9 +201,10 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(
                                          ConvertPolicy::WRAP,
                                          ConvertPolicy::SATURATE,
                                          ConvertPolicy::WRAP,
+                                         ConvertPolicy::SATURATE,
                                        })),
-      framework::dataset::make("Expected", { true, true, true, false, false, false, false, false, true, false, false, true, false })),
+      framework::dataset::make("Expected", { true, true, true, false, false, false, false, false, true, false, false, true, false, false})),
       input1_info, input2_info, output_info, scale, policy, expected)
 {
     bool has_error = bool(NEPixelWiseMultiplication::validate(&input1_info.clone()->set_is_resizable(false), &input2_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), scale, policy, RoundingPolicy::TO_ZERO));
@@ -260,7 +267,7 @@ TEST_SUITE_END() // InPlaceValidate
 
 TEST_SUITE(Quantized)
 TEST_SUITE(QASYMM8_SIGNED)
-TEST_SUITE(Scale255)
+TEST_SUITE(ScaleUnity)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEPixelWiseMultiplicationQASYMM8SignedFixture, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(combine(combine(datasets::SmallShapes(),
                        framework::dataset::make("DataTypeIn1", DataType::QASYMM8_SIGNED)),
                        framework::dataset::make("DataTypeIn2", DataType::QASYMM8_SIGNED)),
@@ -273,8 +280,8 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEPixelWiseMultiplicationQASYMM8SignedFixture,
     // Validate output
     validate(Accessor(_target), _reference, tolerance_qasymm8);
 }
-TEST_SUITE_END() // Scale255
-TEST_SUITE_END() // QASYMM8
+TEST_SUITE_END() // ScaleUnity
+TEST_SUITE_END() // QASYMM8_SIGNED
 
 TEST_SUITE(QASYMM8)
 TEST_SUITE(Scale255)
@@ -476,6 +483,18 @@ TEST_SUITE_END() // ScaleOther
 
 TEST_SUITE_END() // S16toS16
 
+TEST_SUITE(S32toS32)
+
+TEST_SUITE(ScaleUnity)
+PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunSmall, ToS32Fixture<int32_t>, ALL, SmallShapes(), S32, S32, S32, scale_unity, TO_ZERO, InPlaceDataSet, WRAP_VALIDATE(int32_t, 1))
+TEST_SUITE_END() // ScaleUnity
+
+TEST_SUITE(ScaleOther)
+PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunSmall, ToS32Fixture<int32_t>, ALL, SmallShapes(), S32, S32, S32, scale_other, TO_ZERO, InPlaceDataSet, WRAP_VALIDATE(int32_t, 1))
+TEST_SUITE_END() // ScaleOther
+
+TEST_SUITE_END() // S32toS32
+
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 
 TEST_SUITE(F16toF16)
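For context, dataset case 14 above drives NEPixelWiseMultiplication::validate(),
which reports scale255 with S32 inputs as invalid. A minimal sketch of the same
check outside the test framework (hypothetical driver code, not part of the
patch; it assumes Status's explicit bool conversion is true when validation
succeeds, which is how the Expected column above is compared):

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h"

    using namespace arm_compute;

    int main()
    {
        const TensorInfo in1(TensorShape(32U, 13U, 2U), 1, DataType::S32);
        const TensorInfo in2(TensorShape(32U, 13U, 2U), 1, DataType::S32);
        const TensorInfo out(TensorShape(32U, 13U, 2U), 1, DataType::S32);

        // scale == 1/255 is rejected for S32 (case 14: Expected == false)...
        const bool scale255_valid = bool(NEPixelWiseMultiplication::validate(
            &in1, &in2, &out, 1.f / 255.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_UP));
        // ...while scale == 1 is accepted
        const bool unity_valid = bool(NEPixelWiseMultiplication::validate(
            &in1, &in2, &out, 1.f, ConvertPolicy::WRAP, RoundingPolicy::TO_ZERO));

        return (!scale255_valid && unity_valid) ? 0 : 1;
    }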
diff --git a/tests/validation/reference/PixelWiseMultiplication.cpp b/tests/validation/reference/PixelWiseMultiplication.cpp
index 9f70b1c2af..0450991f61 100644
--- a/tests/validation/reference/PixelWiseMultiplication.cpp
+++ b/tests/validation/reference/PixelWiseMultiplication.cpp
@@ -43,6 +43,8 @@ struct is_floating_point
 
 namespace
 {
+constexpr float scale1_constant = 1.f;
+
 /** Compute the result of `src1 * src2 * scale`. The result type always matches the type of @p src2.
  *
  * @param[in] src1 An input value. Data types supported: U8/S16/F16/F32.
@@ -89,6 +91,90 @@ T3 mul(const T1 src1, const T2 src2, float scale, ConvertPolicy convert_policy,
     }
 }
 
+template <>
+int32_t mul(const int32_t src1, const int32_t src2, float scale, ConvertPolicy convert_policy, RoundingPolicy rounding_policy)
+{
+    const int64_t intermediate_val = static_cast<int64_t>(src1) * static_cast<int64_t>(src2);
+
+    if(std::abs(scale - scale1_constant) < 0.00001f)
+    {
+        // Use bit-accurate integer arithmetic for scale == 1
+        // Apply conversion
+        if(convert_policy == ConvertPolicy::SATURATE)
+        {
+            return saturate_cast<int32_t>(intermediate_val);
+        }
+        else
+        {
+            // Correct wrapping behaviour for int32_t
+            const auto i32_hi = static_cast<int64_t>(std::numeric_limits<int32_t>::max());
+            const auto i32_lo = static_cast<int64_t>(std::numeric_limits<int32_t>::lowest());
+            const auto i32_wi = static_cast<int64_t>(1) << 32;
+            int64_t    wrapped_rounded_val = intermediate_val - i32_wi * static_cast<int64_t>(support::cpp11::trunc(static_cast<double>(intermediate_val) / i32_wi));
+            if(wrapped_rounded_val <= i32_hi)
+            {
+                return static_cast<int32_t>(wrapped_rounded_val);
+            }
+            else
+            {
+                // Values beyond i32_hi wrap around to negatives
+                return static_cast<int32_t>((wrapped_rounded_val - i32_hi) + i32_lo - 1);
+            }
+        }
+    }
+    else
+    {
+        // Use double arithmetic for scale != 1; may not be bit-accurate
+        // Apply scaling
+        // scale == 1 / 2^scale_exponent
+        int scale_exponent = 0;
+        std::frexp(scale, &scale_exponent);
+        // Store the positive exponent. We know that we compute 1/2^n
+        // Additionally we need to subtract 1 to compensate that frexp used a mantissa of 0.5
+        scale_exponent = std::abs(scale_exponent - 1);
+        const double scale_inv = static_cast<int64_t>(1) << scale_exponent;
+        const double val       = intermediate_val / scale_inv;
+        // Apply rounding
+        double rounded_val = 0;
+        switch(rounding_policy)
+        {
+            case(RoundingPolicy::TO_ZERO):
+                rounded_val = support::cpp11::trunc(val);
+                break;
+            case(RoundingPolicy::TO_NEAREST_UP):
+                rounded_val = round_half_up(val);
+                break;
+            case(RoundingPolicy::TO_NEAREST_EVEN):
+                rounded_val = round_half_even(val);
+                break;
+            default:
+                ARM_COMPUTE_ERROR("Unsupported rounding policy");
+        }
+        // Apply conversion
+        if(convert_policy == ConvertPolicy::SATURATE)
+        {
+            return saturate_cast<int32_t>(rounded_val);
+        }
+        else
+        {
+            // Correct wrapping behaviour for int32_t
+            const auto i32_hi = static_cast<double>(std::numeric_limits<int32_t>::max());
+            const auto i32_lo = static_cast<double>(std::numeric_limits<int32_t>::lowest());
+            const auto i32_wi = static_cast<double>(static_cast<int64_t>(1) << 32);
+            double     wrapped_rounded_val = rounded_val - i32_wi * std::floor(rounded_val / i32_wi);
+            if(wrapped_rounded_val <= i32_hi)
+            {
+                return static_cast<int32_t>(wrapped_rounded_val);
+            }
+            else
+            {
+                // Values beyond i32_hi wrap around to negatives
+                return static_cast<int32_t>((wrapped_rounded_val - i32_hi) + i32_lo - 1);
+            }
+        }
+    }
+}
+
 template <size_t dim>
 struct BroadcastUnroll
 {
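As a quick illustration of the two convert policies in the specialization
above: on the bit-accurate integer path, WRAP is equivalent to keeping the low
32 bits of the 64-bit product, while SATURATE clamps to the int32 range. A
small hypothetical demo (not part of the patch):

    #include <cstdint>
    #include <iostream>
    #include <limits>

    int main()
    {
        const int32_t a = 65536, b = 65537; // product 2^32 + 2^16 overflows int32
        const int64_t p = static_cast<int64_t>(a) * static_cast<int64_t>(b);

        // WRAP: keep the low 32 bits of the product (two's complement)
        const int32_t wrapped = static_cast<int32_t>(static_cast<uint32_t>(p));
        // SATURATE: clamp to the representable int32 range
        const int64_t hi = std::numeric_limits<int32_t>::max();
        const int64_t lo = std::numeric_limits<int32_t>::lowest();
        const int32_t saturated = static_cast<int32_t>(p > hi ? hi : (p < lo ? lo : p));

        std::cout << wrapped << " " << saturated << std::endl; // 65536 2147483647
        return 0;
    }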
@@ -264,6 +350,7 @@ SimpleTensor pixel_wise_multiplication(const SimpleTensor &src
 // clang-format off
 template SimpleTensor<uint8_t> pixel_wise_multiplication(const SimpleTensor<uint8_t> &src1, const SimpleTensor<uint8_t> &src2, float scale, ConvertPolicy convert_policy, RoundingPolicy rounding_policy, DataType dt_out, const QuantizationInfo &qout);
 template SimpleTensor<int16_t> pixel_wise_multiplication(const SimpleTensor<uint8_t> &src1, const SimpleTensor<int16_t> &src2, float scale, ConvertPolicy convert_policy, RoundingPolicy rounding_policy, DataType dt_out, const QuantizationInfo &qout);
+template SimpleTensor<int32_t> pixel_wise_multiplication(const SimpleTensor<int32_t> &src1, const SimpleTensor<int32_t> &src2, float scale, ConvertPolicy convert_policy, RoundingPolicy rounding_policy, DataType dt_out, const QuantizationInfo &qout);
 template SimpleTensor<half> pixel_wise_multiplication(const SimpleTensor<half> &src1, const SimpleTensor<half> &src2, float scale, ConvertPolicy convert_policy, RoundingPolicy rounding_policy, DataType dt_out, const QuantizationInfo &qout);
 template SimpleTensor<float> pixel_wise_multiplication(const SimpleTensor<float> &src1, const SimpleTensor<float> &src2, float scale, ConvertPolicy convert_policy, RoundingPolicy rounding_policy, DataType dt_out, const QuantizationInfo &qout);
 // clang-format on
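For completeness, a minimal sketch of exercising the new S32 path through the
runtime function (hypothetical host code, not part of the patch; the shape,
scale, and policy choices are arbitrary):

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    int main()
    {
        Tensor src1, src2, dst;
        const TensorInfo info(TensorShape(32U, 13U, 2U), 1, DataType::S32);
        src1.allocator()->init(info);
        src2.allocator()->init(info);
        dst.allocator()->init(info);

        // scale == 1 with TO_ZERO rounding; scale == 1/255 would be rejected for S32
        NEPixelWiseMultiplication pwm;
        pwm.configure(&src1, &src2, &dst, 1.f, ConvertPolicy::WRAP, RoundingPolicy::TO_ZERO);

        src1.allocator()->allocate();
        src2.allocator()->allocate();
        dst.allocator()->allocate();
        // ... fill src1 and src2 with int32_t data, then:
        pwm.run();
        return 0;
    }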