From 81f0d15d6840a0ae8ef571114555a26da74c4a43 Mon Sep 17 00:00:00 2001 From: Michele Di Giorgio Date: Tue, 11 Jul 2017 15:00:52 +0100 Subject: COMPMID-444: Add support for QS8/QS16 NEON Arithmetic Add/Sub/Mul. Change-Id: Ia482498688ca1884272b5062e3415e736e03d36f Reviewed-on: http://mpd-gerrit.cambridge.arm.com/80448 Reviewed-by: Georgios Pinitas Tested-by: Kaizen --- arm_compute/core/NEON/NEFixedPoint.h | 8 ++++++++ arm_compute/core/NEON/NEFixedPoint.inl | 5 +++++ arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h | 12 ++++++------ .../core/NEON/kernels/NEArithmeticSubtractionKernel.h | 12 ++++++------ .../core/NEON/kernels/NEPixelWiseMultiplicationKernel.h | 7 ++++--- 5 files changed, 29 insertions(+), 15 deletions(-) (limited to 'arm_compute/core/NEON') diff --git a/arm_compute/core/NEON/NEFixedPoint.h b/arm_compute/core/NEON/NEFixedPoint.h index 09579f9120..50463b5efe 100644 --- a/arm_compute/core/NEON/NEFixedPoint.h +++ b/arm_compute/core/NEON/NEFixedPoint.h @@ -145,6 +145,14 @@ qint8x16_t vld1q_dup_qs8(const qint8_t *addr); */ qint16x8_t vld1q_dup_qs16(const qint16_t *addr); +/** Load two 16 bit fixed point vectors from memory (8x2 elements) + * + * @param[in] addr Memory address of the 16 bit fixed point vectors to load + * + * @return 16 bit fixed point vectors (8x2 elements) + */ +qint16x8x2_t vld2q_qs16(const qint16_t *addr); + /** Store a single 8 bit fixed point vector to memory (8 elements) * * @param[in] addr Memory address where the 8 bit fixed point vector should be stored diff --git a/arm_compute/core/NEON/NEFixedPoint.inl b/arm_compute/core/NEON/NEFixedPoint.inl index 4e862ba387..05e481561d 100644 --- a/arm_compute/core/NEON/NEFixedPoint.inl +++ b/arm_compute/core/NEON/NEFixedPoint.inl @@ -181,6 +181,11 @@ inline qint16x8_t vld1q_dup_qs16(const qint16_t *addr) return vld1q_dup_s16(addr); } +inline qint16x8x2_t vld2q_qs16(const qint16_t *addr) +{ + return vld2q_s16(addr); +} + inline void vst1_qs8(qint8_t *addr, qint8x8_t b) { vst1_s8(addr, b); 
diff --git a/arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h b/arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h index 9bfdde1616..7ad5893b70 100644 --- a/arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h +++ b/arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h @@ -50,9 +50,9 @@ public: /** Initialise the kernel's input, output and border mode. * - * @param[in] input1 An input tensor. Data types supported: U8/S16/F16/F32 - * @param[in] input2 An input tensor. Data types supported: U8/S16/F16 (only if @p input1 is F16)/F32 (only if @p input1 is F32). - * @param[out] output The output tensor. Data types supported: U8 (Only if both inputs are U8), S16/F16 (only if both inputs are F16), F32 (only if both inputs are F32). + * @param[in] input1 An input tensor. Data types supported: U8/QS8/QS16/S16/F16/F32 + * @param[in] input2 An input tensor. Data types supported: U8, QS8 (only if @p input1 is QS8), QS16 (only if @p input1 is QS16), S16/F16 (only if @p input1 is F16), F32 (only if @p input1 is F32). + * @param[out] output The output tensor. Data types supported: U8 (Only if both inputs are U8), QS8 (only if both inputs are QS8), QS16 (only if both inputs are QS16), S16/F16 (only if @p input1 is F16), F32 (only if both inputs are F32). * @param[in] policy Overflow policy. */ void configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy); @@ -63,9 +63,9 @@ public: private: /** Common signature for all the specialised add functions * - * @param[in] input1 An input tensor. Data types supported: U8/S16/F16/F32 - * @param[in] input2 An input tensor. Data types supported: U8/S16/F16 (only if @p input1 is F16)/F32 (only if @p input1 is F32). - * @param[out] output The output tensor. Data types supported: U8 (Only if both inputs are U8), S16/F16 (only if both inputs are F16), F32 (only if both inputs are F32). + * @param[in] input1 An input tensor. 
Data types supported: U8/QS8/QS16/S16/F16/F32 + * @param[in] input2 An input tensor. Data types supported: U8, QS8 (only if @p input1 is QS8), QS16 (only if @p input1 is QS16), S16/F16 (only if @p input1 is F16), F32 (only if @p input1 is F32). + * @param[out] output The output tensor. Data types supported: U8 (Only if both inputs are U8), QS8 (only if both inputs are QS8), QS16 (only if both inputs are QS16), S16/F16 (only if @p input1 is F16), F32 (only if both inputs are F32). + * @param[in] window Region on which to execute the kernel. */ using AddFunction = void(const ITensor *input1, const ITensor *input2, ITensor *output, const Window &window); diff --git a/arm_compute/core/NEON/kernels/NEArithmeticSubtractionKernel.h b/arm_compute/core/NEON/kernels/NEArithmeticSubtractionKernel.h index 0eb9c23686..6f88d2757a 100644 --- a/arm_compute/core/NEON/kernels/NEArithmeticSubtractionKernel.h +++ b/arm_compute/core/NEON/kernels/NEArithmeticSubtractionKernel.h @@ -50,9 +50,9 @@ public: /** Initialise the kernel's input, output and border mode. * - * @param[in] input1 An input tensor. Data types supported: U8/S16/F32 - * @param[in] input2 An input tensor. Data types supported: U8/S16/F32 (only if @p input1 is F32). - * @param[out] output The output tensor. Data types supported: U8 (Only if both inputs are U8), S16/F32 (only if both inputs are F32). + * @param[in] input1 An input tensor. Data types supported: U8/QS8/QS16/S16/F32 + * @param[in] input2 An input tensor. Data types supported: U8, QS8 (only if @p input1 is QS8), QS16 (only if @p input1 is QS16), S16/F32 (only if @p input1 is F32). + * @param[out] output The output tensor. Data types supported: U8 (Only if both inputs are U8), QS8 (only if both inputs are QS8), QS16 (only if both inputs are QS16), S16/F32 (only if both inputs are F32). + * @param[in] policy Overflow policy. 
*/ void configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy); @@ -63,9 +63,9 @@ public: private: /** Common signature for all the specialised sub functions * - * @param[in] input1 An input tensor. Data types supported: U8, S16, F32. - * @param[in] input2 An input tensor. Data types supported: U8, S16, F32 (only if @p input1 is F32). - * @param[out] output The output tensor. Data types supported: U8 (Only if both inputs are U8), S16, F32 (only if both inputs are F32) + * @param[in] input1 An input tensor. Data types supported: U8/S16/F32 + * @param[in] input2 An input tensor. Data types supported: U8/S16/F32 (only if @p input1 is F32). + * @param[out] output The output tensor. Data types supported: U8 (Only if both inputs are U8), S16/F32 (only if both inputs are F32). * @param[in] window Region on which to execute the kernel. */ using SubFunction = void(const ITensor *input1, const ITensor *input2, ITensor *output, const Window &window); diff --git a/arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h b/arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h index 433a20e48e..bf96c9026c 100644 --- a/arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h +++ b/arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h @@ -51,10 +51,11 @@ public: * * @note For @p scale equal to 1/255 only round to nearest even (implemented as round half up) is supported. * For all other scale values only round to zero (implemented as round towards minus infinity) is supported. + * For QS8/QS16 scale = 1 is the only supported value. * - * @param[in] input1 An input tensor. Data types supported: U8/QS8/S16/F16/F32. - * @param[in] input2 An input tensor. Data types supported: U8/QS8/S16/F16/F32. - * @param[out] output The output tensor. Data types supported: U8 (Only if both inputs are U8) /S16/F16/F32. + * @param[in] input1 An input tensor. 
Data types supported: U8/QS8/QS16/S16/F16/F32 + * @param[in] input2 An input tensor. Data types supported: U8, QS8 (only if @p input1 is QS8), QS16 (only if @p input1 is QS16), S16/F16 (only if @p input1 is F16), F32 (only if @p input1 is F32). + * @param[out] output The output tensor. Data types supported: U8 (Only if both inputs are U8), QS8 (only if both inputs are QS8), QS16 (only if both inputs are QS16), S16/F16 (only if @p input1 is F16), F32 (only if both inputs are F32). * @param[in] scale Scale to apply after multiplication. * Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15. * @param[in] overflow_policy Overflow policy. -- cgit v1.2.1