From 3f632f3f16e29ebeb7065b30008060fd4bfd09f1 Mon Sep 17 00:00:00 2001
From: Michalis Spyrou
Date: Thu, 22 Aug 2019 16:52:00 +0100
Subject: COMPMID-2418: CLDequantizationLayer support for QASYMM8_PER_CHANNEL

Add support for QASYMM8_PER_CHANNEL in CLDequantizationLayer.
Added tests for NHWC and also updated NEON code to work with the
NHWC data layout. Cleaned up the reference implementation.

Change-Id: Ic1d51f16f7f625503fffdbbb66f6487aa588f08c
Signed-off-by: Michalis Spyrou
Reviewed-on: https://review.mlplatform.org/c/1828
Tested-by: Arm Jenkins
Comments-Addressed: Arm Jenkins
Reviewed-by: Georgios Pinitas
---
 .../core/CL/kernels/CLDequantizationLayerKernel.h  |  4 ++--
 arm_compute/core/NEON/NEAsymm.h                    | 22 ++++++++++++++++++++++
 arm_compute/runtime/CL/CLTensorAllocator.h         |  4 ++--
 .../runtime/CL/functions/CLDequantizationLayer.h   |  4 ++--
 4 files changed, 28 insertions(+), 6 deletions(-)

(limited to 'arm_compute')

diff --git a/arm_compute/core/CL/kernels/CLDequantizationLayerKernel.h b/arm_compute/core/CL/kernels/CLDequantizationLayerKernel.h
index 0ee5a13638..830d7518ce 100644
--- a/arm_compute/core/CL/kernels/CLDequantizationLayerKernel.h
+++ b/arm_compute/core/CL/kernels/CLDequantizationLayerKernel.h
@@ -48,13 +48,13 @@ public:
     ~CLDequantizationLayerKernel() = default;
     /** Set the input, output, min and max.
      *
-     * @param[in]  input  Source tensor. Data types supported: QASYMM8/QSYMM8/QSYMM16.
+     * @param[in]  input  Source tensor. Data types supported: QASYMM8/QASYMM8_PER_CHANNEL/QSYMM8/QSYMM16.
      * @param[out] output Destination tensor. Data types supported: F16/F32.
      */
     void configure(const ICLTensor *input, ICLTensor *output);
     /** Static function to check if given info will lead to a valid configuration of @ref CLDequantizationLayerKernel
      *
-     * @param[in] input  Input tensor info. Data types supported: QASYMM8/QSYMM8/QSYMM16.
+     * @param[in] input  Input tensor info. Data types supported: QASYMM8/QASYMM8_PER_CHANNEL/QSYMM8/QSYMM16.
      * @param[in] output Output tensor info. Data types supported: F16/F32.
      *
      * @return a status
diff --git a/arm_compute/core/NEON/NEAsymm.h b/arm_compute/core/NEON/NEAsymm.h
index 981c7b075c..f2d20d373a 100644
--- a/arm_compute/core/NEON/NEAsymm.h
+++ b/arm_compute/core/NEON/NEAsymm.h
@@ -226,6 +226,28 @@ inline float32x4x4_t vdequantize(const uint8x16_t &qv, float scale, int32_t offs
     return vdequantized_input;
 }
 
+/** Dequantize following an asymmetric quantization scheme a neon vector holding 16 quantized values.
+ *
+ * @param[in] qv      Input values to be dequantized.
+ * @param[in] vscale  Vector containing quantization scaling factors.
+ * @param[in] voffset Vector containing quantization offsets.
+ *
+ * @return Dequantized values in a neon vector
+ */
+inline float32x4x4_t vdequantize(const uint8x16_t &qv, const float32x4x4_t vscale, const int32x4x4_t voffset)
+{
+    const float32x4x4_t vdequantized_input =
+    {
+        {
+            vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(qv))))), voffset.val[0])), vscale.val[0]),
+            vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(qv))))), voffset.val[1])), vscale.val[1]),
+            vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(qv))))), voffset.val[2])), vscale.val[2]),
+            vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(qv))))), voffset.val[3])), vscale.val[3]),
+        }
+    };
+    return vdequantized_input;
+}
+
 /** Dequantize following a symmetric quantization scheme a neon vector holding 16 quantized values.
  *
  * @param[in] qv Input values to be dequantized.
diff --git a/arm_compute/runtime/CL/CLTensorAllocator.h b/arm_compute/runtime/CL/CLTensorAllocator.h
index 982cc51274..f7800d39f8 100644
--- a/arm_compute/runtime/CL/CLTensorAllocator.h
+++ b/arm_compute/runtime/CL/CLTensorAllocator.h
@@ -146,8 +146,8 @@ private:
     CLMemory     _memory;  /**< OpenCL memory */
     uint8_t     *_mapping; /**< Pointer to the CPU mapping of the OpenCL buffer. */
     CLTensor    *_owner;   /**< Owner of the allocator */
-    CLFloatArray _scale;
-    CLInt32Array _offset;
+    CLFloatArray _scale;   /**< Scales array in case of quantized per channel data type */
+    CLInt32Array _offset;  /**< Offsets array in case of quantized per channel data type */
 };
 } // namespace arm_compute
 #endif /* __ARM_COMPUTE_CLTENSORALLOCATOR_H__ */
diff --git a/arm_compute/runtime/CL/functions/CLDequantizationLayer.h b/arm_compute/runtime/CL/functions/CLDequantizationLayer.h
index ade589d79e..c519311fb1 100644
--- a/arm_compute/runtime/CL/functions/CLDequantizationLayer.h
+++ b/arm_compute/runtime/CL/functions/CLDequantizationLayer.h
@@ -40,13 +40,13 @@ public:
     /** Set the input and output tensors.
      *
      * @param[in]  input  Source tensor with at least 3 dimensions. The dimensions over the third will be interpreted as batches.
-     *                    Data types supported: QASYMM8/QSYMM8/QSYMM16.
+     *                    Data types supported: QASYMM8/QASYMM8_PER_CHANNEL/QSYMM8/QSYMM16.
      * @param[out] output Destination tensor with the same dimensions of input. Data type supported: F16/F32.
      */
     void configure(const ICLTensor *input, ICLTensor *output);
     /** Static function to check if given info will lead to a valid configuration of @ref CLDequantizationLayer
      *
-     * @param[in] input  Input tensor info. Data types supported: QASYMM8/QSYMM8/QSYMM16.
+     * @param[in] input  Input tensor info. Data types supported: QASYMM8/QASYMM8_PER_CHANNEL/QSYMM8/QSYMM16.
      * @param[in] output Output tensor info. Data type supported: F16/F32.
      *
      * @return a status
--
cgit v1.2.1
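
For reference, a minimal sketch of how the new per-channel vdequantize overload might be exercised. This is not part of the patch: the input bytes, scales and offsets are invented illustration data, and the snippet assumes an AArch64 build with the Compute Library headers on the include path.

#include "arm_compute/core/NEON/NEAsymm.h"

#include <arm_neon.h>
#include <cstdint>
#include <cstdio>

int main()
{
    // 16 quantized input values, assumed to lie along the channel dimension.
    const uint8_t qdata[16] = { 0, 16, 32, 48, 64, 80, 96, 112,
                                128, 144, 160, 176, 192, 208, 224, 240 };

    // Hypothetical per-channel quantization parameters, one (scale, offset) pair per lane.
    const float   scales[16]  = { 0.1f, 0.1f, 0.1f, 0.1f, 0.2f, 0.2f, 0.2f, 0.2f,
                                  0.4f, 0.4f, 0.4f, 0.4f, 0.8f, 0.8f, 0.8f, 0.8f };
    const int32_t offsets[16] = { 8, 8, 8, 8, 16, 16, 16, 16,
                                  32, 32, 32, 32, 64, 64, 64, 64 };

    // Pack the parameters into the float32x4x4_t / int32x4x4_t layout the overload expects.
    const float32x4x4_t vscale  = { { vld1q_f32(scales + 0), vld1q_f32(scales + 4),
                                      vld1q_f32(scales + 8), vld1q_f32(scales + 12) } };
    const int32x4x4_t   voffset = { { vld1q_s32(offsets + 0), vld1q_s32(offsets + 4),
                                      vld1q_s32(offsets + 8), vld1q_s32(offsets + 12) } };

    // Lane i of the result is (qdata[i] - offsets[i]) * scales[i].
    const float32x4x4_t deq = arm_compute::vdequantize(vld1q_u8(qdata), vscale, voffset);

    float results[16];
    for(int i = 0; i < 4; ++i)
    {
        vst1q_f32(results + 4 * i, deq.val[i]);
    }
    for(int i = 0; i < 16; ++i)
    {
        std::printf("lane %2d: %f\n", i, results[i]);
    }
    return 0;
}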
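
At the function level, driving the updated CLDequantizationLayer with a QASYMM8_PER_CHANNEL input could look roughly like the sketch below. Again, this is not from the patch: the tensor shape and the per-channel (scale, offset) pairs are invented, and it assumes the QuantizationInfo overload taking per-channel scale and offset vectors.

#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLDequantizationLayer.h"

#include <vector>

using namespace arm_compute;

int main()
{
    // Create a default OpenCL context and queue for the CL backend.
    CLScheduler::get().default_init();

    // 4x4 plane with 3 channels; one invented (scale, offset) pair per channel.
    const std::vector<float>   scales  = { 0.5f, 0.25f, 0.125f };
    const std::vector<int32_t> offsets = { 10, 128, 3 };

    CLTensor input;
    CLTensor output;
    input.allocator()->init(TensorInfo(TensorShape(4U, 4U, 3U), 1,
                                       DataType::QASYMM8_PER_CHANNEL,
                                       QuantizationInfo(scales, offsets)));
    output.allocator()->init(TensorInfo(TensorShape(4U, 4U, 3U), 1, DataType::F32));

    // validate() can be called with the TensorInfos first to check the configuration.
    CLDequantizationLayer dequant;
    dequant.configure(&input, &output);

    input.allocator()->allocate();
    output.allocator()->allocate();

    // ... fill `input` (e.g. via map()/unmap()), then dequantize:
    dequant.run();
    CLScheduler::get().sync(); // wait for the enqueued kernel to finish
    return 0;
}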