author    Michalis Spyrou <michalis.spyrou@arm.com>  2019-11-28 11:31:23 +0000
committer Michalis Spyrou <michalis.spyrou@arm.com>  2019-12-05 11:58:51 +0000
commit    8d4d1b85bc57d5f76f3939bb422e44df68dc2342 (patch)
tree      8de9dd3c7bec7ea59caa4d6e70b3bbeac877c8b8 /arm_compute
parent    25a6b67cd8188e5a968c0c89adf99f874c7eecb4 (diff)
COMPMID-2796: Add support for QASYMM8_SIGNED in NEActivationLayer and NEPReluLayer
Change-Id: I089fd19a6beab7779d690bc9ace327f661c2753d
Signed-off-by: Michalis Spyrou <michalis.spyrou@arm.com>
Reviewed-on: https://review.mlplatform.org/c/2407
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Michele Di Giorgio <michele.digiorgio@arm.com>
Diffstat (limited to 'arm_compute')
-rw-r--r--  arm_compute/core/NEON/NEAsymm.h                            130
-rw-r--r--  arm_compute/core/NEON/NEAsymm.inl                           33
-rw-r--r--  arm_compute/core/NEON/kernels/NEActivationLayerKernel.h     10
-rw-r--r--  arm_compute/core/QuantizationInfo.h                          7
-rw-r--r--  arm_compute/runtime/NEON/functions/NEActivationLayer.h       4
-rw-r--r--  arm_compute/runtime/NEON/functions/NEPReluLayer.h            4
6 files changed, 179 insertions, 9 deletions
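To show what the new support looks like from the user side, here is a minimal usage sketch of NEActivationLayer on a QASYMM8_SIGNED tensor; the shapes and quantization parameters are illustrative and not taken from this patch.

#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void run_signed_relu()
{
    Tensor src, dst;
    const TensorShape      shape(16U, 16U);
    const QuantizationInfo qinfo(0.05f, 10); // scale and zero-point are illustrative

    src.allocator()->init(TensorInfo(shape, 1, DataType::QASYMM8_SIGNED, qinfo));
    dst.allocator()->init(TensorInfo(shape, 1, DataType::QASYMM8_SIGNED, qinfo));

    NEActivationLayer act;
    act.configure(&src, &dst, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU));

    src.allocator()->allocate();
    dst.allocator()->allocate();
    // ... fill src with quantized data ...
    act.run();
}
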
diff --git a/arm_compute/core/NEON/NEAsymm.h b/arm_compute/core/NEON/NEAsymm.h
index 53a3ea773f..234d48882c 100644
--- a/arm_compute/core/NEON/NEAsymm.h
+++ b/arm_compute/core/NEON/NEAsymm.h
@@ -35,6 +35,12 @@ using qasymm8x8x3_t = uint8x8x3_t; /**< 8 bit quantized asymmetric vector with 2
using qasymm8x8x4_t = uint8x8x4_t; /**< 8 bit quantized asymmetric vector with 32 elements */
using qasymm8x16_t = uint8x16_t; /**< 8 bit quantized asymmetric vector with 16 elements */
+using qasymm8x8_signed_t = int8x8_t; /**< 8 bit quantized signed asymmetric vector with 8 elements */
+using qasymm8x8x2_signed_t = int8x8x2_t; /**< 8 bit quantized signed asymmetric vector with 16 elements */
+using qasymm8x8x3_signed_t = int8x8x3_t; /**< 8 bit quantized signed asymmetric vector with 24 elements */
+using qasymm8x8x4_signed_t = int8x8x4_t; /**< 8 bit quantized signed asymmetric vector with 32 elements */
+using qasymm8x16_signed_t = int8x16_t; /**< 8 bit quantized signed asymmetric vector with 16 elements */
+
/** Perform a multiply-accumulate on all 16 components of a QASYMM8 vector
*
* vd*vs + vo
@@ -47,6 +53,18 @@ using qasymm8x16_t = uint8x16_t; /**< 8 bit quantized asymmetric vector with 1
*/
uint8x16_t vmlaq_qasymm8(qasymm8x16_t vd, float32x4_t vs, float32x4_t vo);
+/** Perform a multiply-accumulate on all 16 components of a QASYMM8_SIGNED vector
+ *
+ * vd*vs + vo
+ *
+ * @param[in] vd Input vector value in QASYMM8_SIGNED format
+ * @param[in] vs Vector multiplier in F32 format. The multiplier value must be duplicated across all four lanes.
+ * @param[in] vo Vector addend in F32 format. The addend value must be duplicated across all four lanes.
+ *
+ * @return A 16-component vector in QASYMM8_SIGNED format, saturated to fit
+ */
+int8x16_t vmlaq_qasymm8_signed(qasymm8x16_signed_t vd, float32x4_t vs, float32x4_t vo);
+
/** Performs final quantization step on 16 elements
*
* @tparam is_bounded_relu Specified if a fused bounded relu should be applied
@@ -336,6 +354,29 @@ inline float32x4x2_t vdequantize(const uint8x8_t &qv, const UniformQuantizationI
return vdequantized_input;
}
+/** Dequantize a neon vector holding 8 signed quantized values.
+ *
+ * @param[in] qv Input values to be dequantized.
+ * @param[in] qi Quantization information to be used in the computation.
+ *
+ * @return Dequantized values in a neon vector
+ */
+inline float32x4x2_t vdequantize(const int8x8_t &qv, const UniformQuantizationInfo &qi)
+{
+ const float scale = qi.scale;
+ const int offset = qi.offset;
+ const int32x4_t voffset = vdupq_n_s32(offset);
+ const float32x4_t vscale = vdupq_n_f32(scale);
+ const float32x4x2_t vdequantized_input =
+ {
+ {
+ vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(qv))), voffset)), vscale),
+ vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(qv))), voffset)), vscale),
+ }
+ };
+ return vdequantized_input;
+}
+
/** Dequantize a neon vector holding 16 quantized values.
*
* @param[in] qv Input values to be dequantized.
@@ -361,6 +402,31 @@ inline float32x4x4_t vdequantize(const uint8x16_t &qv, const UniformQuantization
return vdequantized_input;
}
+/** Dequantize a neon vector holding 16 signed quantized values.
+ *
+ * @param[in] qv Input values to be dequantized.
+ * @param[in] qi Quantization information to be used in the computation.
+ *
+ * @return Dequantized values in a neon vector
+ */
+inline float32x4x4_t vdequantize(const int8x16_t &qv, const UniformQuantizationInfo &qi)
+{
+ const float scale = qi.scale;
+ const int offset = qi.offset;
+ const int32x4_t voffset = vdupq_n_s32(offset);
+ const float32x4_t vscale = vdupq_n_f32(scale);
+ const float32x4x4_t vdequantized_input =
+ {
+ {
+ vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(qv)))), voffset)), vscale),
+ vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(qv)))), voffset)), vscale),
+ vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(qv)))), voffset)), vscale),
+ vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(qv)))), voffset)), vscale),
+ }
+ };
+ return vdequantized_input;
+}
+
/** Dequantize following an asymmetric quantization scheme a neon vector holding 16 quantized values.
*
* @param[in] qv Input values to be dequantized.
@@ -456,6 +522,34 @@ inline uint8x8_t vquantize(const float32x4x2_t &qv, const UniformQuantizationInf
return vqmovun_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])));
}
+/** Quantize a neon vector holding 8 floating point values.
+ *
+ * @param[in] qv Input values to be quantized.
+ * @param[in] qi Quantization information to be used in the computation.
+ *
+ * @return A neon vector holding the signed quantized values
+ */
+inline int8x8_t vquantize_signed(const float32x4x2_t &qv, const UniformQuantizationInfo &qi)
+{
+ const float scale = qi.scale;
+ const int offset = qi.offset;
+ const float32x4_t voffset = vdupq_n_f32(offset);
+ const float32x4_t vinvscale = vdupq_n_f32(1.f / scale);
+ const int32x4x4_t rf =
+ {
+ {
+#ifdef __aarch64__
+ vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)),
+ vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)),
+#else //__aarch64__
+ vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)),
+ vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)),
+#endif //__aarch64__
+ }
+ };
+ return vqmovn_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])));
+}
+
/** Quantize a neon vector holding 16 floating point values.
*
* @param[in] qv Input values to be quantized.
@@ -490,6 +584,42 @@ inline uint8x16_t vquantize(const float32x4x4_t &qv, const UniformQuantizationIn
return vcombine_u8(pa, pb);
}
+/** Quantize to QASYMM8_SIGNED a neon vector holding 16 floating point values.
+ *
+ * @param[in] qv Input values to be quantized.
+ * @param[in] qi Quantization information to be used in the computation.
+ *
+ * @return A neon vector holding the quantized values
+ */
+
+inline int8x16_t vquantize_signed(const float32x4x4_t &qv, const UniformQuantizationInfo &qi)
+{
+ const float scale = qi.scale;
+ const int offset = qi.offset;
+ const float32x4_t voffset = vdupq_n_f32(offset);
+ const float32x4_t vinvscale = vdupq_n_f32(1.f / scale);
+ const int32x4x4_t rf =
+ {
+ {
+#ifdef __aarch64__
+ vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)),
+ vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)),
+ vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[2], vinvscale)),
+ vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[3], vinvscale)),
+#else //__aarch64__
+ vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)),
+ vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)),
+ vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[2], vinvscale)),
+ vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[3], vinvscale)),
+#endif //__aarch64__
+
+ }
+ };
+ const int8x8_t pa = vqmovn_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])));
+ const int8x8_t pb = vqmovn_s16(vcombine_s16(vqmovn_s32(rf.val[2]), vqmovn_s32(rf.val[3])));
+ return vcombine_s8(pa, pb);
+}
+
/** Quantize to QASYMM16 a neon vector holding 16 floating point values.
*
* @param[in] qv Input values to be quantized.
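The signed vdequantize and vquantize_signed overloads added above mirror the existing unsigned ones. A minimal round-trip sketch, assuming a NEON-capable build; the scale and offset values are illustrative.

#include "arm_compute/core/NEON/NEAsymm.h"
#include <arm_neon.h>

void roundtrip_8_values(const int8_t *in, int8_t *out)
{
    const arm_compute::UniformQuantizationInfo qi(0.1f, -5); // scale, offset (illustrative)

    const int8x8_t      q  = vld1_s8(in);                          // 8 signed quantized values
    const float32x4x2_t f  = arm_compute::vdequantize(q, qi);      // int8 -> float32
    const int8x8_t      rq = arm_compute::vquantize_signed(f, qi); // float32 -> int8
    vst1_s8(out, rq);
}
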
diff --git a/arm_compute/core/NEON/NEAsymm.inl b/arm_compute/core/NEON/NEAsymm.inl
index a98c6aa390..71205e0403 100644
--- a/arm_compute/core/NEON/NEAsymm.inl
+++ b/arm_compute/core/NEON/NEAsymm.inl
@@ -56,4 +56,37 @@ inline qasymm8x16_t vmlaq_qasymm8(qasymm8x16_t vd, float32x4_t vs, float32x4_t v
// convert uint16 vectors to uint8 vectors (with saturation)
return vcombine_u8(vqmovn_u16(vd_low_u16x8), vqmovn_u16(vd_high_u16x8));
}
+inline qasymm8x16_signed_t vmlaq_qasymm8_signed(qasymm8x16_signed_t vd, float32x4_t vs, float32x4_t vo)
+{
+ // Convert int8 vectors to int16 vectors
+ const int8x8_t vd_low = vget_low_s8(vd);
+ const int8x8_t vd_high = vget_high_s8(vd);
+ int16x8_t vd_low_s16x8 = vmovl_s8(vd_low);
+ int16x8_t vd_high_s16x8 = vmovl_s8(vd_high);
+ // Convert int16 vectors to int32 vectors
+ int32x4_t A_s32x4 = vmovl_s16(vget_low_s16(vd_low_s16x8));
+ int32x4_t B_s32x4 = vmovl_s16(vget_high_s16(vd_low_s16x8));
+ int32x4_t C_s32x4 = vmovl_s16(vget_low_s16(vd_high_s16x8));
+ int32x4_t D_s32x4 = vmovl_s16(vget_high_s16(vd_high_s16x8));
+ // Convert int32 vectors to float32 vectors
+ float32x4_t A_f32x4 = vcvtq_f32_s32(A_s32x4);
+ float32x4_t B_f32x4 = vcvtq_f32_s32(B_s32x4);
+ float32x4_t C_f32x4 = vcvtq_f32_s32(C_s32x4);
+ float32x4_t D_f32x4 = vcvtq_f32_s32(D_s32x4);
+ // vd = vd*vs + vo
+ A_f32x4 = vmlaq_f32(vo, A_f32x4, vs);
+ B_f32x4 = vmlaq_f32(vo, B_f32x4, vs);
+ C_f32x4 = vmlaq_f32(vo, C_f32x4, vs);
+ D_f32x4 = vmlaq_f32(vo, D_f32x4, vs);
+ // Convert float32 vectors to int32 vectors
+ A_s32x4 = vcvtq_s32_f32(A_f32x4);
+ B_s32x4 = vcvtq_s32_f32(B_f32x4);
+ C_s32x4 = vcvtq_s32_f32(C_f32x4);
+ D_s32x4 = vcvtq_s32_f32(D_f32x4);
+ // Convert int32 vectors to int16 vectors (with saturation)
+ vd_low_s16x8 = vcombine_s16(vqmovn_s32(A_s32x4), vqmovn_s32(B_s32x4));
+ vd_high_s16x8 = vcombine_s16(vqmovn_s32(C_s32x4), vqmovn_s32(D_s32x4));
+ // convert int16 vectors to int8 vectors (with saturation)
+ return vcombine_s8(vqmovn_s16(vd_low_s16x8), vqmovn_s16(vd_high_s16x8));
+}
} // namespace arm_compute
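vmlaq_qasymm8_signed follows the same widen, multiply-accumulate in F32, then narrow-with-saturation pattern as the unsigned variant. A sketch of the kind of call site it targets; the multiplier and addend values are illustrative.

#include "arm_compute/core/NEON/NEAsymm.h"
#include <arm_neon.h>

void scale_and_offset_16(const int8_t *in, int8_t *out)
{
    const float32x4_t vs = vdupq_n_f32(1.25f); // multiplier, duplicated across all four lanes
    const float32x4_t vo = vdupq_n_f32(-3.0f); // addend, duplicated across all four lanes

    const int8x16_t vd = vld1q_s8(in);
    vst1q_s8(out, arm_compute::vmlaq_qasymm8_signed(vd, vs, vo)); // vd * vs + vo, saturated to int8
}
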
diff --git a/arm_compute/core/NEON/kernels/NEActivationLayerKernel.h b/arm_compute/core/NEON/kernels/NEActivationLayerKernel.h
index 9f2a085b3a..82103b988b 100644
--- a/arm_compute/core/NEON/kernels/NEActivationLayerKernel.h
+++ b/arm_compute/core/NEON/kernels/NEActivationLayerKernel.h
@@ -58,7 +58,7 @@ public:
* @note If the output tensor is a nullptr, the activation function will be performed in-place
*
* @param[in, out] input Source tensor. In case of @p output tensor = nullptr, this tensor will store the result
- * of the activation function. Data types supported: QASYMM8/QSYMM16/F16/F32.
+ * of the activation function. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM16/F16/F32.
* @param[out] output Destination tensor. Data type supported: same as @p input
* @param[in] activation_info Activation layer information.
*/
@@ -66,7 +66,7 @@ public:
/** Static function to check if given info will lead to a valid configuration of @ref NEActivationLayerKernel
*
* @param[in] input Source tensor info. In case of @p output tensor info = nullptr, this tensor will store the result
- * of the activation function. Data types supported: QASYMM8/QSYMM16/F16/F32.
+ * of the activation function. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM16/F16/F32.
* @param[in] output Destination tensor info. Data type supported: same as @p input
* @param[in] act_info Activation layer information.
*
@@ -102,6 +102,12 @@ private:
* @param[in] window Region on which to execute the kernel
*/
template <ActivationLayerInfo::ActivationFunction F, typename T>
+ typename std::enable_if<std::is_same<T, qasymm8_signed_t>::value, void>::type activation(const Window &window);
+ /** Function to apply an activation function on a tensor.
+ *
+ * @param[in] window Region on which to execute the kernel
+ */
+ template <ActivationLayerInfo::ActivationFunction F, typename T>
typename std::enable_if<std::is_same<T, qsymm16_t>::value, void>::type activation(const Window &window);
private:
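The kernel selects a per-type implementation through SFINAE on the element type, as in the enable_if overloads above. A stripped-down sketch of the pattern; the struct and member names are illustrative and not the kernel's actual internals.

#include <cstdint>
#include <type_traits>

using qasymm8_t        = uint8_t;
using qasymm8_signed_t = int8_t;

struct ActivationKernelSketch
{
    template <typename T>
    typename std::enable_if<std::is_same<T, qasymm8_t>::value, void>::type activation()
    {
        // unsigned 8-bit path
    }

    template <typename T>
    typename std::enable_if<std::is_same<T, qasymm8_signed_t>::value, void>::type activation()
    {
        // signed 8-bit path introduced by this patch
    }
};

// ActivationKernelSketch{}.activation<qasymm8_signed_t>(); // resolves to the signed overload
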
diff --git a/arm_compute/core/QuantizationInfo.h b/arm_compute/core/QuantizationInfo.h
index 7a6fe42098..06ba665c6b 100644
--- a/arm_compute/core/QuantizationInfo.h
+++ b/arm_compute/core/QuantizationInfo.h
@@ -33,9 +33,10 @@
namespace arm_compute
{
-using qasymm8_t = uint8_t; /**< 8 bit quantized asymmetric scalar value */
-using qsymm16_t = int16_t; /**< 16 bit quantized symmetric scalar value */
-using qasymm16_t = uint16_t; /**< 16 bit quantized asymmetric scalar value */
+using qasymm8_signed_t = int8_t; /**< 8 bit signed quantized asymmetric scalar value */
+using qasymm8_t = uint8_t; /**< 8 bit quantized asymmetric scalar value */
+using qsymm16_t = int16_t; /**< 16 bit quantized symmetric scalar value */
+using qasymm16_t = uint16_t; /**< 16 bit quantized asymmetric scalar value */
/** Quantization info when assuming per layer quantization */
struct UniformQuantizationInfo
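The new qasymm8_signed_t alias follows the usual asymmetric scheme, real = scale * (q - offset). A hand-rolled scalar sketch of that mapping, not tied to any particular library helper:

#include <algorithm>
#include <cmath>
#include <cstdint>

using qasymm8_signed_t = int8_t;

// real = scale * (q - offset)
inline float dequantize_s8(qasymm8_signed_t q, float scale, int32_t offset)
{
    return scale * static_cast<float>(static_cast<int32_t>(q) - offset);
}

// q = clamp(round(real / scale) + offset, -128, 127)
inline qasymm8_signed_t quantize_s8(float value, float scale, int32_t offset)
{
    const int32_t q = static_cast<int32_t>(std::lround(value / scale)) + offset;
    return static_cast<qasymm8_signed_t>(std::max(-128, std::min(127, q)));
}
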
diff --git a/arm_compute/runtime/NEON/functions/NEActivationLayer.h b/arm_compute/runtime/NEON/functions/NEActivationLayer.h
index cd9b22d397..95901dc2d8 100644
--- a/arm_compute/runtime/NEON/functions/NEActivationLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEActivationLayer.h
@@ -59,7 +59,7 @@ public:
* @note If the output tensor is a nullptr or is equal to the input, the activation function will be performed in-place
*
* @param[in, out] input Source tensor. In case of @p output tensor = nullptr, this tensor will store the result
- * of the activation function. Data types supported: QASYMM8/QSYMM16/F16/F32.
+ * of the activation function. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM16/F16/F32.
* @param[out] output Destination tensor. Data type supported: same as @p input
* @param[in] activation_info Activation layer parameters.
*/
@@ -68,7 +68,7 @@ public:
/** Static function to check if given info will lead to a valid configuration of @ref NEActivationLayer
*
* @param[in] input Source tensor info. In case of @p output tensor info = nullptr, this tensor will store the result
- * of the activation function. Data types supported: QASYMM8/QSYMM16/F16/F32.
+ * of the activation function. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM16/F16/F32.
* @param[in] output Destination tensor info. Data type supported: same as @p input
* @param[in] act_info Activation layer information.
*
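As with the configure() documentation above, support can be probed ahead of time through the static validate() overload. A minimal sketch with illustrative tensor infos:

#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"

using namespace arm_compute;

bool relu_supported_for_signed(const TensorShape &shape, const QuantizationInfo &qinfo)
{
    const TensorInfo          src(shape, 1, DataType::QASYMM8_SIGNED, qinfo);
    const TensorInfo          dst(shape, 1, DataType::QASYMM8_SIGNED, qinfo);
    const ActivationLayerInfo act(ActivationLayerInfo::ActivationFunction::RELU);

    return NEActivationLayer::validate(&src, &dst, act).error_code() == ErrorCode::OK;
}
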
diff --git a/arm_compute/runtime/NEON/functions/NEPReluLayer.h b/arm_compute/runtime/NEON/functions/NEPReluLayer.h
index c0a1df472f..102a165383 100644
--- a/arm_compute/runtime/NEON/functions/NEPReluLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEPReluLayer.h
@@ -40,14 +40,14 @@ class NEPReluLayer : public INESimpleFunction
public:
/** Set the input and output tensor.
*
- * @param[in] input Source tensor. Data types supported: QASYMM8/F16/F32.
+ * @param[in] input Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
* @param[in] alpha Source alpha tensor. Data types supported: same as @p input.
* @param[out] output Destination tensor. Data type supported: same as @p input
*/
void configure(const ITensor *input, const ITensor *alpha, ITensor *output);
/** Static function to check if given info will lead to a valid configuration of @ref NEPReluLayer
*
- * @param[in] input Source tensor info. Data types supported: QASYMM8/F16/F32.
+ * @param[in] input Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
* @param[in] alpha Source alpha tensor info. Data types supported: same as @p input.
* @param[in] output Destination tensor info. Data type supported: same as @p input
*
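A minimal usage sketch of NEPReluLayer on QASYMM8_SIGNED tensors; the shapes and quantization parameters are illustrative and not taken from this patch.

#include "arm_compute/runtime/NEON/functions/NEPReluLayer.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void run_signed_prelu()
{
    Tensor input, alpha, output;
    const TensorShape      shape(8U, 8U, 3U);
    const QuantizationInfo qinfo(0.02f, -10); // illustrative scale / zero-point

    input.allocator()->init(TensorInfo(shape, 1, DataType::QASYMM8_SIGNED, qinfo));
    alpha.allocator()->init(TensorInfo(shape, 1, DataType::QASYMM8_SIGNED, qinfo));
    output.allocator()->init(TensorInfo(shape, 1, DataType::QASYMM8_SIGNED, qinfo));

    NEPReluLayer prelu;
    prelu.configure(&input, &alpha, &output);

    input.allocator()->allocate();
    alpha.allocator()->allocate();
    output.allocator()->allocate();
    // ... fill input and alpha with quantized data ...
    prelu.run();
}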