From 0bc80daf319ea3219ca6a6fa200118dc859ee460 Mon Sep 17 00:00:00 2001
From: morgolock <pablo.tello@arm.com>
Date: Mon, 10 Aug 2020 16:44:18 +0100
Subject: MLCE-229: Support for negative shifts in asm kernels

Change-Id: I2c5e98aae7698963f106d7423df0e65cd00ee2a9
Signed-off-by: morgolock <pablo.tello@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/3710
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Sheri Zhang <sheri.zhang@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
---
 .../NEON/functions/NEGEMMAssemblyDispatch.cpp      | 31 +++++++++++++++-------
 .../functions/NEGEMMLowpMatrixMultiplyCore.cpp     | 29 +++-----------------
 2 files changed, 26 insertions(+), 34 deletions(-)

(limited to 'src/runtime')
diff --git a/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp b/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp
index 3b9dde2bf7..eeea3a45ee 100644
--- a/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp
+++ b/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp
@@ -182,8 +182,8 @@ public:
       *
       * @return A tuple with the pointers to the shift and multiplier data respectively
       */
-    std::tuple<const int32_t *, const int32_t *> set_requantize_data(const std::vector<int32_t> &shifts,
-                                                                     const std::vector<int32_t> &multipliers);
+    std::tuple<bool, const int32_t *, const int32_t *, const int32_t *> set_requantize_data(const std::vector<int32_t> &shifts,
+                                                                                            const std::vector<int32_t> &multipliers);
 
     // Inherited methods overridden:
     void run() override;
@@ -235,18 +235,29 @@ private:
     arm_gemm::KernelDescription _kernel_info{};
     /** Per channel quantization shifts */
     std::vector<int32_t> _shifts{};
+    std::vector<int32_t> right_shifts{}; /**< Per channel right shifts: min(-shift, 0) for each shift, filled by set_requantize_data() */
+    std::vector<int32_t> left_shifts{};  /**< Per channel left shifts: max(-shift, 0) for each shift, filled by set_requantize_data() */
     /** Per channel quantization multipliers */
     std::vector<int32_t> _multipliers{};
 };
 
 template <typename TypeInput, typename TypeOutput, class OutputStage>
-std::tuple<const int32_t *, const int32_t *> Fallback<TypeInput, TypeOutput, OutputStage>::set_requantize_data(const std::vector<int32_t> &shifts,
-                                                                                                               const std::vector<int32_t> &multipliers)
+std::tuple<bool, const int32_t *, const int32_t *, const int32_t *> Fallback<TypeInput, TypeOutput, OutputStage>::set_requantize_data(const std::vector<int32_t> &shifts,
+        const std::vector<int32_t> &multipliers)
 {
-    _multipliers = multipliers;
-    _shifts      = shifts;
-    std::transform(_shifts.begin(), _shifts.end(), _shifts.begin(), std::negate<int32_t>());
-    return std::make_tuple(_shifts.data(), _multipliers.data());
+    _multipliers = multipliers;
+    _shifts      = shifts;
+    // Split every negated shift into a left part (>= 0) and a right part (<= 0), rebuilding both vectors from scratch
+    left_shifts.clear();
+    right_shifts.clear();
+    bool need_left = false;
+    for(const auto s : _shifts)
+    {
+        left_shifts.push_back(std::max(-s, int32_t(0)));
+        right_shifts.push_back(std::min(-s, int32_t(0)));
+        need_left = need_left || (s < 0); // -s is a non-zero left shift only when s is negative
+    }
+    return std::make_tuple(need_left, left_shifts.data(), right_shifts.data(), _multipliers.data());
 }
 
 template <typename TypeInput, typename TypeOutput, class OutputStage>
@@ -498,7 +509,9 @@ void create_arm_gemm_quant(std::unique_ptr<NEGEMMAssemblyDispatch::IFallback> &a
         const auto requantize_data = fallback->set_requantize_data(os_info.gemmlowp_shifts, os_info.gemmlowp_multipliers);
         gemm_requant_info          = arm_gemm::Requantize32(nullptr, 0,
                                                             a_offset, b_offset, os_info.gemmlowp_offset,
-                                                            std::get<0>(requantize_data), std::get<1>(requantize_data),
+                                                            (std::get<0>(requantize_data)) ? std::get<1>(requantize_data) : nullptr,
+                                                            std::get<2>(requantize_data),
+                                                            std::get<3>(requantize_data),
                                                             os_info.gemmlowp_min_bound, os_info.gemmlowp_max_bound);
     }
     else
diff --git a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
index dada6d16da..83db146a8a 100644
--- a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
+++ b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
@@ -117,18 +117,8 @@ void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b,
         {
             if(is_data_type_quantized_asymmetric(a_to_use->info()->data_type()) && info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
             {
-                // Result shifts < 0 are not supported by asm kernels
-                const std::vector<int32_t> &shifts           = info.gemmlowp_output_stage().gemmlowp_shifts;
-                const bool                  is_asm_supported = info.gemmlowp_output_stage().gemmlowp_shift >= 0
-                                                               && std::all_of(shifts.cbegin(), shifts.cend(), [](int32_t val)
-                {
-                    return val >= 0;
-                });
-                if(is_asm_supported)
-                {
-                    _asm_glue.configure(a_to_use, b, c, output, gemm_info);
-                    _fused_assembly_path = _asm_glue.is_configured();
-                }
+                _asm_glue.configure(a_to_use, b, c, output, gemm_info);
+                _fused_assembly_path = _asm_glue.is_configured();
             }
             else
             {
@@ -339,19 +329,8 @@ Status NEGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITenso
     bool run_optimised_requantized = false;
     if(is_data_type_quantized_asymmetric(a_to_use->data_type()) && info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
     {
-        // Result shifts < 0 are not supported by asm kernels
-        const std::vector<int32_t> &shifts           = info.gemmlowp_output_stage().gemmlowp_shifts;
-        const bool                  is_asm_supported = info.gemmlowp_output_stage().gemmlowp_shift >= 0
-                                                       && std::all_of(shifts.cbegin(), shifts.cend(), [](int32_t val)
-        {
-            return val >= 0;
-        });
-
-        if(is_asm_supported)
-        {
-            run_optimised             = bool(NEGEMMAssemblyDispatch::validate(a_to_use, b, c, output, gemm_info));
-            run_optimised_requantized = run_optimised;
-        }
+        run_optimised             = bool(NEGEMMAssemblyDispatch::validate(a_to_use, b, c, output, gemm_info));
+        run_optimised_requantized = run_optimised;
     }
     else
     {
-- 
cgit v1.2.1