COMPMID-3284 add utilities for layer normalization of NEON QLSTM

Change-Id: Ie98a8c4c30ac7859a989a29cbe7602c1c6fec26b Signed-off-by: Sang-Hoon Park <sang-hoon.park@arm.com> Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/2934 Tested-by: Arm Jenkins <bsgcomp@arm.com> Comments-Addressed: Arm Jenkins <bsgcomp@arm.com> Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
author: Sang-Hoon Park <sang-hoon.park@arm.com> 2020-03-26 14:02:37 +0000
committer: Sang-Hoon Park <sang-hoon.park@arm.com> 2020-03-27 13:27:45 +0000
commit: 396cb95774bd7627254e3befec5e34844de701c9 (patch)
tree: efdb87d02e398dba7440cee994e2d7a434bf51b8
parent: 1a531faf71c7563cf7b1d2e36cc4261e6a4a9906 (diff)
download: ComputeLibrary-396cb95774bd7627254e3befec5e34844de701c9.tar.gz
3 files changed, 115 insertions, 2 deletions
diff --git a/arm_compute/core/NEON/NESymm.h b/arm_compute/core/NEON/NESymm.h
index 924840930a..0cc2a963cf 100644
--- a/arm_compute/core/NEON/NESymm.h
+++ b/arm_compute/core/NEON/NESymm.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2020 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,6 +25,7 @@
 #define ARM_COMPUTE_NESYMM_H
 
 #include "arm_compute/core/NEON/NEMath.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
 #include <arm_neon.h>
 
 namespace arm_compute
@@ -230,5 +231,26 @@ inline qsymm16x8x2_t vquantize_qsymm16(const float32x4x4_t &qv, const UniformQua
     return res;
 }
 
+/** Scale two rows of int32 lanes by a quantized multiplier and a power-of-two shift
+ *
+ * When @p shift is positive every lane is first multiplied by 2^shift. The lanes are then
+ * multiplied by @p qmul with vqrdmulhq_n_s32 and, when @p shift is negative, finally passed
+ * through a rounding division by 2^(-shift).
+ *
+ * @param[in] input Pair of vectors holding the values to multiply.
+ * @param[in] qmul  Quantized multiplier
+ * @param[in] shift Bit shift; a positive value shifts left, a negative value shifts right
+ *
+ * @return A pair of neon vectors holding the multiplied values
+ */
+inline int32x4x2_t multiply_by_quantized_multipler_2row(int32x4x2_t input, int32_t qmul, int32_t shift)
+{
+    // Split the signed shift into a pre-multiplication factor and a post-multiplication right shift
+    int32_t pre_scale  = 1;
+    int32_t post_shift = 0;
+    if(shift > 0)
+    {
+        pre_scale = 1 << shift;
+    }
+    else
+    {
+        post_shift = -shift;
+    }
+
+    int32x4x2_t output;
+    for(int row = 0; row < 2; ++row)
+    {
+        const int32x4_t scaled  = vmulq_n_s32(input.val[row], pre_scale);
+        const int32x4_t product = vqrdmulhq_n_s32(scaled, qmul);
+        output.val[row]         = rounding_divide_by_pow2(product, post_shift);
+    }
+
+    return output;
+}
+
 } // namespace arm_compute
 #endif // ARM_COMPUTE_NESYMM_H
diff --git a/arm_compute/core/utils/quantization/AsymmHelpers.h b/arm_compute/core/utils/quantization/AsymmHelpers.h
index 94876fb02f..0f0ec72b60 100644
--- a/arm_compute/core/utils/quantization/AsymmHelpers.h
+++ b/arm_compute/core/utils/quantization/AsymmHelpers.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -99,6 +99,46 @@ void compute_quantized_multipliers_and_shifts(const ITensorInfo *input,
                                               unsigned int       idx_ofms,
                                               int32_t           *output_multipliers_ptr,
                                               int32_t           *output_shifts_ptr);
+
+/** Divide a value by a power of two and round the result to the nearest integer, copied from NEMath
+ *
+ * @note This function calculates x / 2^n rounded to nearest, where n = exponent. Exact halves are
+ *       rounded away from zero (e.g. x = 5, n = 1 gives 3 and x = -5, n = 1 gives -3).
+ * @note NOTE(review): the implementation evaluates (1 << exponent), so exponent is presumably
+ *       expected to lie in [0, 30] - confirm against callers.
+ *
+ * @param[in] x        Element to divide.
+ * @param[in] exponent Exponent n of the power-of-two divisor 2^n
+ *
+ * @return x divided by 2^exponent, rounded to the nearest integer
+ */
+int32_t rounding_divide_by_pow2(int32_t x, int exponent);
+
+/** Compute the rounded high 32 bits of the doubled product of two integers
+ *
+ * @note The product is formed in 64-bit arithmetic, nudged for rounding and divided by 2^31,
+ *       i.e. the result is (2 * a * b) / 2^32 rounded to nearest.
+ * @note The only input that would overflow, a == b == INT32_MIN, returns INT32_MAX.
+ *
+ * @param[in] a One integer to multiply
+ * @param[in] b Another integer to multiply
+ *
+ * @return The rounded, saturated high half of the doubled product
+ */
+int32_t saturating_rounding_doubling_highmul(int32_t a, int32_t b);
+
+/** Compute the value multiplied by given quantized multiplier and shift
+ *
+ * @note A positive shift multiplies the input by 2^shift before it is passed to
+ *       saturating_rounding_doubling_highmul() together with qmul. A negative shift instead
+ *       applies rounding_divide_by_pow2() with exponent -shift to the product.
+ *
+ * @param[in] input Target value to multiply.
+ * @param[in] qmul  Quantized multiplier
+ * @param[in] shift Bit shift; a positive value shifts left, a negative value shifts right
+ *
+ * @return The multiplied value
+ */
+int32_t multiply_by_quantized_multipler(int32_t input, int32_t qmul, int32_t shift);
+
+/** Compute the value multiplied by a power-of-two
+ *
+ * @note An exponent of zero returns the value unchanged. A negative exponent performs a rounding
+ *       division by 2^(-exponent) through rounding_divide_by_pow2(). A positive exponent multiplies
+ *       by 2^exponent and saturates the result to the range of int32_t.
+ *
+ * @param[in] exponent Exponent used to calculate power-of-two
+ * @param[in] v        Target value to multiply
+ *
+ * @return The multiplied value
+ */
+int32_t saturating_rounding_multiply_by_pow2(int exponent, int32_t v);
+
 } // namespace quantization
 } // namespace arm_compute
 #endif /* ARM_COMPUTE_IO_FILE_HANDLER_H */
diff --git a/src/core/utils/quantization/AsymmHelpers.cpp b/src/core/utils/quantization/AsymmHelpers.cpp
index e1ba6413b4..c5eef9dd77 100644
--- a/src/core/utils/quantization/AsymmHelpers.cpp
+++ b/src/core/utils/quantization/AsymmHelpers.cpp
@@ -196,5 +196,56 @@ void compute_quantized_multipliers_and_shifts(const ITensorInfo *input,
         output_shifts_ptr[i]      = output_shift;
     }
 }
+
+// Returns the rounded high 32 bits of 2 * a * b, saturating the single overflowing input pair.
+int32_t saturating_rounding_doubling_highmul(int32_t a, int32_t b)
+{
+    constexpr int32_t lowest = std::numeric_limits<int32_t>::min();
+
+    // INT32_MIN * INT32_MIN is the only product whose doubled high half does not fit in int32_t
+    if(a == lowest && b == lowest)
+    {
+        return std::numeric_limits<int32_t>::max();
+    }
+
+    const int64_t product = static_cast<int64_t>(a) * static_cast<int64_t>(b);
+
+    // Half of the divisor, signed like the product, so the truncating division rounds to nearest
+    int32_t rounding = 1 - (1 << 30);
+    if(product >= 0)
+    {
+        rounding = 1 << 30;
+    }
+
+    return static_cast<int32_t>((product + rounding) / (1ll << 31));
+}
+
+// Divides x by 2^exponent and rounds to the nearest integer, with exact halves rounded away from zero.
+// Deliberately not marked inline: the function is declared as an ordinary external function in
+// AsymmHelpers.h, so this translation unit has to provide the out-of-line definition other
+// translation units link against.
+int32_t rounding_divide_by_pow2(int32_t x, int exponent)
+{
+    // Bits of x that are discarded by the shift
+    const int32_t mask = (1 << exponent) - 1;
+    // Negative values need a strictly larger remainder before the shifted result is bumped towards zero
+    const int32_t threshold = (mask >> 1) + (x < 0 ? 1 : 0);
+    return (x >> exponent) + ((x & mask) > threshold ? 1 : 0);
+}
+
+// Multiplies input by the quantized multiplier qmul; a positive shift is applied to the input
+// before the multiplication, a negative shift as a rounding right shift after it.
+int32_t multiply_by_quantized_multipler(int32_t input, int32_t qmul, int32_t shift)
+{
+    int32_t scaled_input = input;
+    int32_t post_shift   = 0;
+    if(shift > 0)
+    {
+        scaled_input = input * (1 << shift);
+    }
+    else
+    {
+        post_shift = -shift;
+    }
+
+    const int32_t product = saturating_rounding_doubling_highmul(scaled_input, qmul);
+    return rounding_divide_by_pow2(product, post_shift);
+}
+
+// Multiplies v by 2^exponent.
+//  - exponent == 0: v is returned unchanged.
+//  - exponent <  0: v is divided by 2^(-exponent) with rounding to nearest.
+//  - exponent >  0: v is scaled up and the result is saturated to the int32_t range.
+int32_t saturating_rounding_multiply_by_pow2(int32_t exponent, int32_t v)
+{
+    if(exponent == 0)
+    {
+        return v;
+    }
+    if(exponent < 0)
+    {
+        return rounding_divide_by_pow2(v, -exponent);
+    }
+
+    constexpr int32_t min = std::numeric_limits<int32_t>::min();
+    constexpr int32_t max = std::numeric_limits<int32_t>::max();
+
+    // Scaling by 2^31 or more pushes every non-zero value out of the int32_t range; handling it
+    // here also keeps the 64-bit multiplication below free of overflow.
+    if(exponent >= 31)
+    {
+        return v > 0 ? max : (v < 0 ? min : 0);
+    }
+
+    // Scale in 64 bits so that neither a negative operand nor an out-of-range result ever reaches
+    // a 32-bit left shift, then clamp to the destination range.
+    const int64_t scaled = static_cast<int64_t>(v) * (int64_t{ 1 } << exponent);
+    if(scaled > max)
+    {
+        return max;
+    }
+    if(scaled < min)
+    {
+        return min;
+    }
+    return static_cast<int32_t>(scaled);
+}
 } // quantization
 } // arm_compute
author	Sang-Hoon Park <sang-hoon.park@arm.com>	2020-03-26 14:02:37 +0000
committer	Sang-Hoon Park <sang-hoon.park@arm.com>	2020-03-27 13:27:45 +0000
commit	396cb95774bd7627254e3befec5e34844de701c9 (patch)
tree	efdb87d02e398dba7440cee994e2d7a434bf51b8
parent	1a531faf71c7563cf7b1d2e36cc4261e6a4a9906 (diff)
download	ComputeLibrary-396cb95774bd7627254e3befec5e34844de701c9.tar.gz