aboutsummaryrefslogtreecommitdiff
path: root/src/core/NEON/NEAsymm.h
diff options
context:
space:
mode:
authorSang-Hoon Park <sang-hoon.park@arm.com>2020-11-25 11:46:03 +0000
committerSang-Hoon Park <sang-hoon.park@arm.com>2020-12-02 10:18:46 +0000
commitadd8e815ea94c8f8e6b1c9faf18527695f1332ec (patch)
tree5941064344b426d12bc76b2fba3d0c631e796088 /src/core/NEON/NEAsymm.h
parent4ffc42afafc8e6eee9917ac27b4bc510973335bf (diff)
downloadComputeLibrary-add8e815ea94c8f8e6b1c9faf18527695f1332ec.tar.gz
COMPMID-3862: Add support QASYMM8 LEAKY RELU activation
- LEAKY RELU activation is supported for QASYMM8 data type
- vquantize on NEON side has been modified to match with other backends (OpenCL and reference)

Change-Id: I194631225c8d4f3cc96027d64812ec2be2b4328a
Signed-off-by: Sang-Hoon Park <sang-hoon.park@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/4593
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
Reviewed-by: Manuel Bottini <manuel.bottini@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Diffstat (limited to 'src/core/NEON/NEAsymm.h')
-rw-r--r--src/core/NEON/NEAsymm.h85
1 file changed, 26 insertions, 59 deletions
diff --git a/src/core/NEON/NEAsymm.h b/src/core/NEON/NEAsymm.h
index 70d48d5835..9b92a865d0 100644
--- a/src/core/NEON/NEAsymm.h
+++ b/src/core/NEON/NEAsymm.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_NEASYMM_H
#include "src/core/NEON/NEMath.h"
+#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
#include <arm_neon.h>
namespace arm_compute
@@ -647,35 +648,39 @@ inline int8x8_t vquantize_signed(const float32x4x2_t &qv, const UniformQuantizat
return vqmovn_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])));
}
-/** Quantize a neon vector holding 16 floating point values.
- *
- * @param[in] qv Input values to be quantized.
- * @param[in] qi Quantization information to be used in the computation.
- *
- * @return A neon vector holding the quantized values
- */
-inline uint8x16_t vquantize(const float32x4x4_t &qv, const UniformQuantizationInfo &qi)
+inline int32x4x4_t vquantize_internal(const float32x4x4_t &qv, float scale, int32_t offset)
{
- const float scale = qi.scale;
- const int offset = qi.offset;
- const float32x4_t voffset = vdupq_n_f32(offset);
+ const int32x4_t voffset = vdupq_n_s32(offset);
const float32x4_t vinvscale = vdupq_n_f32(1.f / scale);
const int32x4x4_t rf =
{
{
#ifdef __aarch64__
- vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)),
- vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)),
- vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[2], vinvscale)),
- vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[3], vinvscale)),
+ vaddq_s32(vcvtaq_s32_f32(vmulq_f32(qv.val[0], vinvscale)), voffset),
+ vaddq_s32(vcvtaq_s32_f32(vmulq_f32(qv.val[1], vinvscale)), voffset),
+ vaddq_s32(vcvtaq_s32_f32(vmulq_f32(qv.val[2], vinvscale)), voffset),
+ vaddq_s32(vcvtaq_s32_f32(vmulq_f32(qv.val[3], vinvscale)), voffset),
#else //__aarch64__
- vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)),
- vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)),
- vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[2], vinvscale)),
- vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[3], vinvscale)),
+ vaddq_s32(vcvtq_s32_f32(vmulq_f32(qv.val[0], vinvscale)), voffset),
+ vaddq_s32(vcvtq_s32_f32(vmulq_f32(qv.val[1], vinvscale)), voffset),
+ vaddq_s32(vcvtq_s32_f32(vmulq_f32(qv.val[2], vinvscale)), voffset),
+ vaddq_s32(vcvtq_s32_f32(vmulq_f32(qv.val[3], vinvscale)), voffset),
#endif //__aarch64__
}
};
+ return rf;
+}
+
+/** Quantize a neon vector holding 16 floating point values.
+ *
+ * @param[in] qv Input values to be quantized.
+ * @param[in] qi Quantization information to be used in the computation.
+ *
+ * @return A neon vector holding the quantized values
+ */
+inline uint8x16_t vquantize(const float32x4x4_t &qv, const UniformQuantizationInfo &qi)
+{
+ auto rf = vquantize_internal(qv, qi.scale, qi.offset);
const uint8x8_t pa = vqmovun_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])));
const uint8x8_t pb = vqmovun_s16(vcombine_s16(vqmovn_s32(rf.val[2]), vqmovn_s32(rf.val[3])));
return vcombine_u8(pa, pb);
@@ -690,26 +695,7 @@ inline uint8x16_t vquantize(const float32x4x4_t &qv, const UniformQuantizationIn
*/
inline int8x16_t vquantize_signed(const float32x4x4_t &qv, const UniformQuantizationInfo &qi)
{
- const float scale = qi.scale;
- const int offset = qi.offset;
- const float32x4_t voffset = vdupq_n_f32(offset);
- const float32x4_t vinvscale = vdupq_n_f32(1.f / scale);
- const int32x4x4_t rf =
- {
- {
-#ifdef __aarch64__
- vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)),
- vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)),
- vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[2], vinvscale)),
- vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[3], vinvscale)),
-#else //__aarch64__
- vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)),
- vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)),
- vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[2], vinvscale)),
- vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[3], vinvscale)),
-#endif //__aarch64__
- }
- };
+ auto rf = vquantize_internal(qv, qi.scale, qi.offset);
const int8x8_t pa = vqmovn_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])));
const int8x8_t pb = vqmovn_s16(vcombine_s16(vqmovn_s32(rf.val[2]), vqmovn_s32(rf.val[3])));
return vcombine_s8(pa, pb);
@@ -724,26 +710,7 @@ inline int8x16_t vquantize_signed(const float32x4x4_t &qv, const UniformQuantiza
*/
inline uint16x8x2_t vquantize_qasymm16(const float32x4x4_t &qv, const UniformQuantizationInfo &qi)
{
- const float scale = qi.scale;
- const int offset = qi.offset;
- const float32x4_t voffset = vdupq_n_f32(offset);
- const float32x4_t vinvscale = vdupq_n_f32(1.f / scale);
- const int32x4x4_t rf =
- {
- {
-#ifdef __aarch64__
- vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)),
- vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)),
- vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[2], vinvscale)),
- vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[3], vinvscale)),
-#else //__aarch64__
- vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)),
- vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)),
- vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[2], vinvscale)),
- vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[3], vinvscale)),
-#endif //__aarch64__
- }
- };
+ auto rf = vquantize_internal(qv, qi.scale, qi.offset);
const uint16x8_t pa = vcombine_u16(vqmovun_s32(rf.val[0]), vqmovun_s32(rf.val[1]));
const uint16x8_t pb = vcombine_u16(vqmovun_s32(rf.val[2]), vqmovun_s32(rf.val[3]));
return { pa, pb };