From 3f632f3f16e29ebeb7065b30008060fd4bfd09f1 Mon Sep 17 00:00:00 2001
From: Michalis Spyrou
Date: Thu, 22 Aug 2019 16:52:00 +0100
Subject: COMPMID-2418: CLDequantizationLayer support for QASYMM8_PER_CHANNEL

Add support for QASYMM8_PER_CHANNEL in CLDequantizationLayer.
Added tests for NHWC and also updated NEON code to work with the
NHWC data layout. Cleaned up the reference implementation.

Change-Id: Ic1d51f16f7f625503fffdbbb66f6487aa588f08c
Signed-off-by: Michalis Spyrou
Reviewed-on: https://review.mlplatform.org/c/1828
Tested-by: Arm Jenkins
Comments-Addressed: Arm Jenkins
Reviewed-by: Georgios Pinitas
---
 .../core/CL/kernels/CLDequantizationLayerKernel.h  |  4 ++--
 arm_compute/core/NEON/NEAsymm.h                    | 22 ++++++++++++++++++++++
 arm_compute/runtime/CL/CLTensorAllocator.h         |  4 ++--
 .../runtime/CL/functions/CLDequantizationLayer.h   |  4 ++--
 4 files changed, 28 insertions(+), 6 deletions(-)

(limited to 'arm_compute')

diff --git a/arm_compute/core/CL/kernels/CLDequantizationLayerKernel.h b/arm_compute/core/CL/kernels/CLDequantizationLayerKernel.h
index 0ee5a13638..830d7518ce 100644
--- a/arm_compute/core/CL/kernels/CLDequantizationLayerKernel.h
+++ b/arm_compute/core/CL/kernels/CLDequantizationLayerKernel.h
@@ -48,13 +48,13 @@ public:
     ~CLDequantizationLayerKernel() = default;
     /** Set the input, output, min and max.
      *
-     * @param[in]  input  Source tensor. Data types supported: QASYMM8/QSYMM8/QSYMM16.
+     * @param[in]  input  Source tensor. Data types supported: QASYMM8/QASYMM8_PER_CHANNEL/QSYMM8/QSYMM16.
      * @param[out] output Destination tensor. Data types supported: F16/F32.
      */
     void configure(const ICLTensor *input, ICLTensor *output);
     /** Static function to check if given info will lead to a valid configuration of @ref CLDequantizationLayerKernel
      *
-     * @param[in] input  Input tensor info. Data types supported: QASYMM8/QSYMM8/QSYMM16.
+     * @param[in] input  Input tensor info. Data types supported: QASYMM8/QASYMM8_PER_CHANNEL/QSYMM8/QSYMM16.
      * @param[in] output Output tensor info. Data types supported: F16/F32.
      *
      * @return a status
diff --git a/arm_compute/core/NEON/NEAsymm.h b/arm_compute/core/NEON/NEAsymm.h
index 981c7b075c..f2d20d373a 100644
--- a/arm_compute/core/NEON/NEAsymm.h
+++ b/arm_compute/core/NEON/NEAsymm.h
@@ -226,6 +226,28 @@ inline float32x4x4_t vdequantize(const uint8x16_t &qv, float scale, int32_t offs
     return vdequantized_input;
 }
 
+/** Dequantize following an asymmetric quantization scheme a neon vector holding 16 quantized values.
+ *
+ * @param[in] qv      Input values to be dequantized.
+ * @param[in] vscale  Vector containing quantization scaling factors.
+ * @param[in] voffset Vector containing quantization offsets.
+ *
+ * @return Dequantized values in a neon vector
+ */
+inline float32x4x4_t vdequantize(const uint8x16_t &qv, const float32x4x4_t vscale, const int32x4x4_t voffset)
+{
+    const float32x4x4_t vdequantized_input =
+    {
+        {
+            vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(qv))))), voffset.val[0])), vscale.val[0]),
+            vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(qv))))), voffset.val[1])), vscale.val[1]),
+            vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(qv))))), voffset.val[2])), vscale.val[2]),
+            vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(qv))))), voffset.val[3])), vscale.val[3]),
+        }
+    };
+    return vdequantized_input;
+}
+
 /** Dequantize following a symmetric quantization scheme a neon vector holding 16 quantized values.
  *
  * @param[in] qv Input values to be dequantized.
diff --git a/arm_compute/runtime/CL/CLTensorAllocator.h b/arm_compute/runtime/CL/CLTensorAllocator.h
index 982cc51274..f7800d39f8 100644
--- a/arm_compute/runtime/CL/CLTensorAllocator.h
+++ b/arm_compute/runtime/CL/CLTensorAllocator.h
@@ -146,8 +146,8 @@ private:
     CLMemory     _memory;  /**< OpenCL memory */
     uint8_t     *_mapping; /**< Pointer to the CPU mapping of the OpenCL buffer. */
     CLTensor    *_owner;   /**< Owner of the allocator */
-    CLFloatArray _scale;
-    CLInt32Array _offset;
+    CLFloatArray _scale;   /**< Scales array in case of quantized per channel data type */
+    CLInt32Array _offset;  /**< Offsets array in case of quantized per channel data type */
 };
 } // namespace arm_compute
 #endif /* __ARM_COMPUTE_CLTENSORALLOCATOR_H__ */
diff --git a/arm_compute/runtime/CL/functions/CLDequantizationLayer.h b/arm_compute/runtime/CL/functions/CLDequantizationLayer.h
index ade589d79e..c519311fb1 100644
--- a/arm_compute/runtime/CL/functions/CLDequantizationLayer.h
+++ b/arm_compute/runtime/CL/functions/CLDequantizationLayer.h
@@ -40,13 +40,13 @@ public:
     /** Set the input and output tensors.
      *
      * @param[in]  input  Source tensor with at least 3 dimensions. The dimensions over the third will be interpreted as batches.
-     *                    Data types supported: QASYMM8/QSYMM8/QSYMM16.
+     *                    Data types supported: QASYMM8/QASYMM8_PER_CHANNEL/QSYMM8/QSYMM16.
      * @param[out] output Destination tensor with the same dimensions of input. Data type supported: F16/F32.
      */
     void configure(const ICLTensor *input, ICLTensor *output);
     /** Static function to check if given info will lead to a valid configuration of @ref CLDequantizationLayer
      *
-     * @param[in] input  Input tensor info. Data types supported: QASYMM8/QSYMM8/QSYMM16.
+     * @param[in] input  Input tensor info. Data types supported: QASYMM8/QASYMM8_PER_CHANNEL/QSYMM8/QSYMM16.
      * @param[in] output Output tensor info. Data type supported: F16/F32.
      *
      * @return a status
--
cgit v1.2.1
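
For reference, a minimal sketch of how the new per-channel vdequantize overload might be exercised. This is not part of the patch: the input bytes, scales and offsets are invented illustration data, and the snippet assumes an AArch64 build with the Compute Library headers on the include path.

#include "arm_compute/core/NEON/NEAsymm.h"

#include <arm_neon.h>
#include <cstdint>
#include <cstdio>

int main()
{
    // 16 quantized input values, assumed to lie along the channel dimension.
    const uint8_t qdata[16] = { 0, 16, 32, 48, 64, 80, 96, 112,
                                128, 144, 160, 176, 192, 208, 224, 240 };

    // Hypothetical per-channel quantization parameters, one (scale, offset) pair per lane.
    const float   scales[16]  = { 0.1f, 0.1f, 0.1f, 0.1f, 0.2f, 0.2f, 0.2f, 0.2f,
                                  0.4f, 0.4f, 0.4f, 0.4f, 0.8f, 0.8f, 0.8f, 0.8f };
    const int32_t offsets[16] = { 8, 8, 8, 8, 16, 16, 16, 16,
                                  32, 32, 32, 32, 64, 64, 64, 64 };

    // Pack the parameters into the float32x4x4_t / int32x4x4_t layout the overload expects.
    const float32x4x4_t vscale  = { { vld1q_f32(scales + 0), vld1q_f32(scales + 4),
                                      vld1q_f32(scales + 8), vld1q_f32(scales + 12) } };
    const int32x4x4_t   voffset = { { vld1q_s32(offsets + 0), vld1q_s32(offsets + 4),
                                      vld1q_s32(offsets + 8), vld1q_s32(offsets + 12) } };

    // Lane i of the result is (qdata[i] - offsets[i]) * scales[i].
    const float32x4x4_t deq = arm_compute::vdequantize(vld1q_u8(qdata), vscale, voffset);

    float results[16];
    for(int i = 0; i < 4; ++i)
    {
        vst1q_f32(results + 4 * i, deq.val[i]);
    }
    for(int i = 0; i < 16; ++i)
    {
        std::printf("lane %2d: %f\n", i, results[i]);
    }
    return 0;
}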
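
At the function level, driving the updated CLDequantizationLayer with a QASYMM8_PER_CHANNEL input could look roughly like the sketch below. Again, this is not from the patch: the tensor shape and the per-channel (scale, offset) pairs are invented, and it assumes the QuantizationInfo overload taking per-channel scale and offset vectors.

#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLDequantizationLayer.h"

#include <vector>

using namespace arm_compute;

int main()
{
    // Create a default OpenCL context and queue for the CL backend.
    CLScheduler::get().default_init();

    // 4x4 plane with 3 channels; one invented (scale, offset) pair per channel.
    const std::vector<float>   scales  = { 0.5f, 0.25f, 0.125f };
    const std::vector<int32_t> offsets = { 10, 128, 3 };

    CLTensor input;
    CLTensor output;
    input.allocator()->init(TensorInfo(TensorShape(4U, 4U, 3U), 1,
                                       DataType::QASYMM8_PER_CHANNEL,
                                       QuantizationInfo(scales, offsets)));
    output.allocator()->init(TensorInfo(TensorShape(4U, 4U, 3U), 1, DataType::F32));

    // validate() can be called with the TensorInfos first to check the configuration.
    CLDequantizationLayer dequant;
    dequant.configure(&input, &output);

    input.allocator()->allocate();
    output.allocator()->allocate();

    // ... fill `input` (e.g. via map()/unmap()), then dequantize:
    dequant.run();
    CLScheduler::get().sync(); // wait for the enqueued kernel to finish
    return 0;
}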