author    Sang-Hoon Park <sang-hoon.park@arm.com>    2020-11-25 11:46:03 +0000
committer Sang-Hoon Park <sang-hoon.park@arm.com>    2020-12-02 10:18:46 +0000
commit    add8e815ea94c8f8e6b1c9faf18527695f1332ec (patch)
tree      5941064344b426d12bc76b2fba3d0c631e796088 /src
parent    4ffc42afafc8e6eee9917ac27b4bc510973335bf (diff)
download  ComputeLibrary-add8e815ea94c8f8e6b1c9faf18527695f1332ec.tar.gz
COMPMID-3862: Add support for QASYMM8 LEAKY RELU activation
- LEAKY RELU activation is supported for the QASYMM8 data type
- vquantize on the NEON side has been modified to match the other backends (OpenCL and reference)

Change-Id: I194631225c8d4f3cc96027d64812ec2be2b4328a
Signed-off-by: Sang-Hoon Park <sang-hoon.park@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/4593
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
Reviewed-by: Manuel Bottini <manuel.bottini@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
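For orientation, quantized LEAKY RELU here follows the usual dequantize, activate, requantize flow: the 8-bit input is mapped back to float with the input scale/offset, the activation is applied in float, and the result is quantized with the output scale/offset. A minimal scalar sketch of that per-element math follows; the helper is illustrative only and not part of the library, whose kernels below use their own quantize_qasymm8/dequantize_qasymm8 helpers and NEON intrinsics.

#include <algorithm>
#include <cmath>
#include <cstdint>

// Illustrative scalar model of quantized LEAKY RELU (hypothetical helper).
uint8_t leaky_relu_qasymm8(uint8_t in, float alpha,
                           float in_scale, int32_t in_offset,
                           float out_scale, int32_t out_offset)
{
    const float   x = (static_cast<int32_t>(in) - in_offset) * in_scale;            // dequantize
    const float   y = (x > 0.f) ? x : alpha * x;                                    // LEAKY RELU
    const int32_t q = static_cast<int32_t>(std::round(y / out_scale)) + out_offset; // requantize
    return static_cast<uint8_t>(std::min(std::max(q, 0), 255));                     // saturate to [0, 255]
}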
Diffstat (limited to 'src')
-rw-r--r--  src/core/CL/kernels/CLActivationLayerKernel.cpp                            10
-rw-r--r--  src/core/NEON/NEAsymm.h                                                    85
-rw-r--r--  src/core/NEON/kernels/NEActivationLayerKernel.cpp                           5
-rw-r--r--  src/core/NEON/kernels/activation/impl/qasymm8_neon_activation.cpp          81
-rw-r--r--  src/core/NEON/kernels/activation/impl/qasymm8_signed_neon_activation.cpp   81
-rw-r--r--  src/core/NEON/wrapper/intrinsics/cgtz.h                                    62
-rw-r--r--  src/core/NEON/wrapper/intrinsics/intrinsics.h                               1
7 files changed, 227 insertions, 98 deletions
diff --git a/src/core/CL/kernels/CLActivationLayerKernel.cpp b/src/core/CL/kernels/CLActivationLayerKernel.cpp
index 9f9538cb76..00b6829e91 100644
--- a/src/core/CL/kernels/CLActivationLayerKernel.cpp
+++ b/src/core/CL/kernels/CLActivationLayerKernel.cpp
@@ -53,14 +53,15 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
ActivationLayerInfo::ActivationFunction::LOGISTIC,
ActivationLayerInfo::ActivationFunction::TANH,
- ActivationLayerInfo::ActivationFunction::HARD_SWISH
+ ActivationLayerInfo::ActivationFunction::HARD_SWISH,
+ ActivationLayerInfo::ActivationFunction::LEAKY_RELU,
};
const DataType data_type = input->data_type();
const QuantizationInfo &oq_info = (output != nullptr) ? output->quantization_info() : input->quantization_info();
const ActivationLayerInfo::ActivationFunction f_act = act_info.activation();
ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_data_type_quantized(data_type) && (quantized_supported_activations.count(f_act) == 0),
- "For Quantized data type only tanh, logistic, relu and lower/upper bounded relu are supported");
+ "For Quantized data type only hard swish, leaky relu, tanh, logistic, relu and lower/upper bounded relu are supported");
ARM_COMPUTE_RETURN_ERROR_ON(data_type == DataType::QASYMM8 && (f_act == ActivationLayerInfo::ActivationFunction::TANH) && (oq_info != QuantizationInfo(1.f / 128.f, 128)));
ARM_COMPUTE_RETURN_ERROR_ON(data_type == DataType::QASYMM8 && (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) && (oq_info != QuantizationInfo(1.f / 256.f, 0)));
@@ -112,7 +113,10 @@ void CLActivationLayerKernel::configure(const CLCompileContext &compile_context,
const ActivationLayerInfo::ActivationFunction f_act = act_info.activation();
const bool is_quantized = is_data_type_quantized(dt);
const bool perform_activation_in_float =
- (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) || (f_act == ActivationLayerInfo::ActivationFunction::TANH) || (f_act == ActivationLayerInfo::ActivationFunction::HARD_SWISH);
+ (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
+ || (f_act == ActivationLayerInfo::ActivationFunction::TANH)
+ || (f_act == ActivationLayerInfo::ActivationFunction::HARD_SWISH)
+ || (f_act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU);
// Set build options
CLBuildOptions build_opts;
diff --git a/src/core/NEON/NEAsymm.h b/src/core/NEON/NEAsymm.h
index 70d48d5835..9b92a865d0 100644
--- a/src/core/NEON/NEAsymm.h
+++ b/src/core/NEON/NEAsymm.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_NEASYMM_H
#include "src/core/NEON/NEMath.h"
+#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
#include <arm_neon.h>
namespace arm_compute
@@ -647,35 +648,39 @@ inline int8x8_t vquantize_signed(const float32x4x2_t &qv, const UniformQuantizat
return vqmovn_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])));
}
-/** Quantize a neon vector holding 16 floating point values.
- *
- * @param[in] qv Input values to be quantized.
- * @param[in] qi Quantization information to be used in the computation.
- *
- * @return A neon vector holding the quantized values
- */
-inline uint8x16_t vquantize(const float32x4x4_t &qv, const UniformQuantizationInfo &qi)
+inline int32x4x4_t vquantize_internal(const float32x4x4_t &qv, float scale, int32_t offset)
{
- const float scale = qi.scale;
- const int offset = qi.offset;
- const float32x4_t voffset = vdupq_n_f32(offset);
+ const int32x4_t voffset = vdupq_n_s32(offset);
const float32x4_t vinvscale = vdupq_n_f32(1.f / scale);
const int32x4x4_t rf =
{
{
#ifdef __aarch64__
- vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)),
- vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)),
- vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[2], vinvscale)),
- vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[3], vinvscale)),
+ vaddq_s32(vcvtaq_s32_f32(vmulq_f32(qv.val[0], vinvscale)), voffset),
+ vaddq_s32(vcvtaq_s32_f32(vmulq_f32(qv.val[1], vinvscale)), voffset),
+ vaddq_s32(vcvtaq_s32_f32(vmulq_f32(qv.val[2], vinvscale)), voffset),
+ vaddq_s32(vcvtaq_s32_f32(vmulq_f32(qv.val[3], vinvscale)), voffset),
#else //__aarch64__
- vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)),
- vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)),
- vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[2], vinvscale)),
- vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[3], vinvscale)),
+ vaddq_s32(vcvtq_s32_f32(vmulq_f32(qv.val[0], vinvscale)), voffset),
+ vaddq_s32(vcvtq_s32_f32(vmulq_f32(qv.val[1], vinvscale)), voffset),
+ vaddq_s32(vcvtq_s32_f32(vmulq_f32(qv.val[2], vinvscale)), voffset),
+ vaddq_s32(vcvtq_s32_f32(vmulq_f32(qv.val[3], vinvscale)), voffset),
#endif //__aarch64__
}
};
+ return rf;
+}
+
+/** Quantize a neon vector holding 16 floating point values.
+ *
+ * @param[in] qv Input values to be quantized.
+ * @param[in] qi Quantization information to be used in the computation.
+ *
+ * @return A neon vector holding the quantized values
+ */
+inline uint8x16_t vquantize(const float32x4x4_t &qv, const UniformQuantizationInfo &qi)
+{
+ auto rf = vquantize_internal(qv, qi.scale, qi.offset);
const uint8x8_t pa = vqmovun_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])));
const uint8x8_t pb = vqmovun_s16(vcombine_s16(vqmovn_s32(rf.val[2]), vqmovn_s32(rf.val[3])));
return vcombine_u8(pa, pb);
@@ -690,26 +695,7 @@ inline uint8x16_t vquantize(const float32x4x4_t &qv, const UniformQuantizationIn
*/
inline int8x16_t vquantize_signed(const float32x4x4_t &qv, const UniformQuantizationInfo &qi)
{
- const float scale = qi.scale;
- const int offset = qi.offset;
- const float32x4_t voffset = vdupq_n_f32(offset);
- const float32x4_t vinvscale = vdupq_n_f32(1.f / scale);
- const int32x4x4_t rf =
- {
- {
-#ifdef __aarch64__
- vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)),
- vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)),
- vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[2], vinvscale)),
- vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[3], vinvscale)),
-#else //__aarch64__
- vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)),
- vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)),
- vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[2], vinvscale)),
- vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[3], vinvscale)),
-#endif //__aarch64__
- }
- };
+ auto rf = vquantize_internal(qv, qi.scale, qi.offset);
const int8x8_t pa = vqmovn_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])));
const int8x8_t pb = vqmovn_s16(vcombine_s16(vqmovn_s32(rf.val[2]), vqmovn_s32(rf.val[3])));
return vcombine_s8(pa, pb);
@@ -724,26 +710,7 @@ inline int8x16_t vquantize_signed(const float32x4x4_t &qv, const UniformQuantiza
*/
inline uint16x8x2_t vquantize_qasymm16(const float32x4x4_t &qv, const UniformQuantizationInfo &qi)
{
- const float scale = qi.scale;
- const int offset = qi.offset;
- const float32x4_t voffset = vdupq_n_f32(offset);
- const float32x4_t vinvscale = vdupq_n_f32(1.f / scale);
- const int32x4x4_t rf =
- {
- {
-#ifdef __aarch64__
- vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)),
- vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)),
- vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[2], vinvscale)),
- vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[3], vinvscale)),
-#else //__aarch64__
- vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)),
- vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)),
- vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[2], vinvscale)),
- vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[3], vinvscale)),
-#endif //__aarch64__
- }
- };
+ auto rf = vquantize_internal(qv, qi.scale, qi.offset);
const uint16x8_t pa = vcombine_u16(vqmovun_s32(rf.val[0]), vqmovun_s32(rf.val[1]));
const uint16x8_t pb = vcombine_u16(vqmovun_s32(rf.val[2]), vqmovun_s32(rf.val[3]));
return { pa, pb };
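The refactor above folds the three duplicated float-to-int conversion bodies into vquantize_internal. Note the aarch64 path also changes numerically: the old code folded the offset into the float multiply-accumulate and rounded with vcvtnq_s32_f32 (round to nearest, ties to even), while the new code multiplies, rounds with vcvtaq_s32_f32 (ties away from zero), and adds the offset as an integer, which is the behaviour the commit message says now matches the OpenCL and reference backends. A scalar model of the new per-lane math (std::round shares the ties-away-from-zero convention):

#include <cmath>
#include <cstdint>

// Scalar model of vquantize_internal's per-lane computation after this patch.
int32_t quantize_lane(float v, float scale, int32_t offset)
{
    return static_cast<int32_t>(std::round(v * (1.f / scale))) + offset;
}
// Tie-breaking example: if v / scale == 2.5, vcvtaq/std::round give 3,
// whereas the previous vcvtnq (round to nearest even) gave 2.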
diff --git a/src/core/NEON/kernels/NEActivationLayerKernel.cpp b/src/core/NEON/kernels/NEActivationLayerKernel.cpp
index 51257cb96b..d969fd8e38 100644
--- a/src/core/NEON/kernels/NEActivationLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEActivationLayerKernel.cpp
@@ -110,7 +110,8 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
ActivationLayerInfo::ActivationFunction::LOGISTIC,
ActivationLayerInfo::ActivationFunction::TANH,
- ActivationLayerInfo::ActivationFunction::HARD_SWISH
+ ActivationLayerInfo::ActivationFunction::HARD_SWISH,
+ ActivationLayerInfo::ActivationFunction::LEAKY_RELU,
};
const static std::set<ActivationLayerInfo::ActivationFunction> qsymm16_supported_activations =
{
@@ -123,7 +124,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
const ActivationLayerInfo::ActivationFunction f_act = activation_info.activation();
ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_data_type_quantized_asymmetric(data_type) && (qasymm8_supported_activations.count(f_act) == 0),
- "For QASYMM8 only tanh, logistic, relu and lower/upper bounded relu are supported");
+ "For QASYMM8 only hard swish, leaky relu, tanh, logistic, relu and lower/upper bounded relu are supported");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_data_type_quantized_symmetric(data_type) && (qsymm16_supported_activations.count(f_act) == 0),
"For QSYMM16 only tanh and logistic are supported");
diff --git a/src/core/NEON/kernels/activation/impl/qasymm8_neon_activation.cpp b/src/core/NEON/kernels/activation/impl/qasymm8_neon_activation.cpp
index 8a398fb531..7b26441824 100644
--- a/src/core/NEON/kernels/activation/impl/qasymm8_neon_activation.cpp
+++ b/src/core/NEON/kernels/activation/impl/qasymm8_neon_activation.cpp
@@ -51,23 +51,26 @@ void qasymm8_neon_activation(const ITensor *src, ITensor *dst, const ActivationL
Iterator input(src, win_collapsed);
Iterator output(dst, win_collapsed);
- const UniformQuantizationInfo qi_in = src->info()->quantization_info().uniform();
- const UniformQuantizationInfo qi_out = dst->info()->quantization_info().uniform();
- const qasymm8x16_t va = vdupq_n_u8(quantize_qasymm8(act_info.a(), qi_in));
- const qasymm8x16_t vb = vdupq_n_u8(quantize_qasymm8(act_info.b(), qi_in));
- const qasymm8_t a = quantize_qasymm8(act_info.a(), qi_in);
- const qasymm8_t b = quantize_qasymm8(act_info.b(), qi_in);
- const qasymm8_t const_0 = quantize_qasymm8(0.f, qi_in);
- const qasymm8x16_t vconst_0 = vdupq_n_u8(const_0);
- const auto vconst_1 = vdupq_n_f32(1.f);
- const float32x4_t va_f32 = vdupq_n_f32(act_info.a());
- const float32x4_t vb_f32 = vdupq_n_f32(act_info.b());
- const float a_f32 = act_info.a();
- const float b_f32 = act_info.b();
- const auto const_6_f32 = vdupq_n_f32(6.f);
- const auto const_0_f32 = vdupq_n_f32(0.f);
- const auto const_3_f32 = vdupq_n_f32(3.f);
- const auto const_inv_6_f32 = vdupq_n_f32(0.166666667f);
+ const UniformQuantizationInfo qi_in = src->info()->quantization_info().uniform();
+ const UniformQuantizationInfo qi_out = dst->info()->quantization_info().uniform();
+ const qasymm8x16_t va = vdupq_n_u8(quantize_qasymm8(act_info.a(), qi_in));
+ const qasymm8x16_t vb = vdupq_n_u8(quantize_qasymm8(act_info.b(), qi_in));
+ const qasymm8_t a = quantize_qasymm8(act_info.a(), qi_in);
+ const qasymm8_t b = quantize_qasymm8(act_info.b(), qi_in);
+ const qasymm8_t const_0 = quantize_qasymm8(0.f, qi_in);
+ const qasymm8x16_t vconst_0 = vdupq_n_u8(const_0);
+ const auto vconst_1 = vdupq_n_f32(1.f);
+#ifndef __aarch64__
+ const auto vconst_0_f32 = vdupq_n_f32(0.f);
+#endif // __aarch64__
+ const float32x4_t va_f32 = vdupq_n_f32(act_info.a());
+ const float32x4_t vb_f32 = vdupq_n_f32(act_info.b());
+ const float a_f32 = act_info.a();
+ const float b_f32 = act_info.b();
+ const auto const_6_f32 = vdupq_n_f32(6.f);
+ const auto const_0_f32 = vdupq_n_f32(0.f);
+ const auto const_3_f32 = vdupq_n_f32(3.f);
+ const auto const_inv_6_f32 = vdupq_n_f32(0.166666667f);
// Initialise scale/offset for re-quantization
float s = qi_in.scale / qi_out.scale;
@@ -159,6 +162,44 @@ void qasymm8_neon_activation(const ITensor *src, ITensor *dst, const ActivationL
// Re-quantize to new output space
tmp = vquantize(tmp_dep, qi_out);
}
+ else if(act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU)
+ {
+ const auto vin_deq = vdequantize(vin, qi_in);
+
+#ifdef __aarch64__
+ const uint32x4x4_t pos_mask =
+ {
+ {
+ wrapper::vcgtz(vin_deq.val[0]),
+ wrapper::vcgtz(vin_deq.val[1]),
+ wrapper::vcgtz(vin_deq.val[2]),
+ wrapper::vcgtz(vin_deq.val[3]),
+ }
+ };
+#else // __aarch64__
+ const uint32x4x4_t pos_mask =
+ {
+ {
+ wrapper::vcgt(vin_deq.val[0], vconst_0_f32),
+ wrapper::vcgt(vin_deq.val[1], vconst_0_f32),
+ wrapper::vcgt(vin_deq.val[2], vconst_0_f32),
+ wrapper::vcgt(vin_deq.val[3], vconst_0_f32),
+ }
+ };
+#endif // __aarch64__
+
+ const float32x4x4_t tmp_dep =
+ {
+ {
+ wrapper::vbsl(pos_mask.val[0], vin_deq.val[0], wrapper::vmul(va_f32, vin_deq.val[0])),
+ wrapper::vbsl(pos_mask.val[1], vin_deq.val[1], wrapper::vmul(va_f32, vin_deq.val[1])),
+ wrapper::vbsl(pos_mask.val[2], vin_deq.val[2], wrapper::vmul(va_f32, vin_deq.val[2])),
+ wrapper::vbsl(pos_mask.val[3], vin_deq.val[3], wrapper::vmul(va_f32, vin_deq.val[3])),
+ }
+ };
+
+ tmp = vquantize(tmp_dep, qi_out);
+ }
else
{
ARM_COMPUTE_ERROR("Unsupported activation function");
@@ -204,6 +245,12 @@ void qasymm8_neon_activation(const ITensor *src, ITensor *dst, const ActivationL
tmp_f = tmp_f * ((std::min(std::max((tmp_f + 3), 0.0f), 6.0f)) * 0.166666667f);
tmp = quantize_qasymm8(tmp_f, qi_out);
}
+ else if(act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU)
+ {
+ float tmp_f = dequantize_qasymm8(in, qi_in);
+ tmp_f = tmp_f > 0 ? tmp_f : tmp_f * a_f32;
+ tmp = quantize_qasymm8(tmp_f, qi_out);
+ }
else
{
ARM_COMPUTE_ERROR("Unsupported activation function");
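The vector path above computes the mask once per four dequantized lanes and selects between the identity and the scaled input with a bitwise select: vcgtz/vcgt produce all-ones lanes where x > 0, and vbsl keeps x for those lanes and alpha * x elsewhere. The same pattern on a single float32x4_t with raw NEON intrinsics, as a standalone sketch rather than the kernel's wrapper-based code:

#include <arm_neon.h>

// Leaky ReLU on four floats: build a lane mask where x > 0, then bit-select.
float32x4_t leaky_relu_f32x4(float32x4_t x, float alpha)
{
    const uint32x4_t  pos_mask = vcgtq_f32(x, vdupq_n_f32(0.f)); // 0xFFFFFFFF where x > 0
    const float32x4_t neg_path = vmulq_f32(vdupq_n_f32(alpha), x);
    return vbslq_f32(pos_mask, x, neg_path);                     // mask ? x : alpha * x
}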
diff --git a/src/core/NEON/kernels/activation/impl/qasymm8_signed_neon_activation.cpp b/src/core/NEON/kernels/activation/impl/qasymm8_signed_neon_activation.cpp
index bfab07c8e3..c616c5e27d 100644
--- a/src/core/NEON/kernels/activation/impl/qasymm8_signed_neon_activation.cpp
+++ b/src/core/NEON/kernels/activation/impl/qasymm8_signed_neon_activation.cpp
@@ -50,23 +50,26 @@ void qasymm8_signed_neon_activation(const ITensor *src, ITensor *dst, const Acti
Iterator input(src, win_collapsed);
Iterator output(dst, win_collapsed);
- const UniformQuantizationInfo qi_in = src->info()->quantization_info().uniform();
- const UniformQuantizationInfo qi_out = dst->info()->quantization_info().uniform();
- const qasymm8x16_signed_t va = vdupq_n_s8(quantize_qasymm8_signed(act_info.a(), qi_in));
- const qasymm8x16_signed_t vb = vdupq_n_s8(quantize_qasymm8_signed(act_info.b(), qi_in));
- const qasymm8_signed_t a = quantize_qasymm8_signed(act_info.a(), qi_in);
- const qasymm8_signed_t b = quantize_qasymm8_signed(act_info.b(), qi_in);
- const qasymm8_signed_t const_0 = quantize_qasymm8_signed(0.f, qi_in);
- const qasymm8x16_signed_t vconst_0 = vdupq_n_s8(const_0);
- const auto vconst_1 = vdupq_n_f32(1.f);
- const float32x4_t va_f32 = vdupq_n_f32(act_info.a());
- const float32x4_t vb_f32 = vdupq_n_f32(act_info.b());
- const float a_f32 = act_info.a();
- const float b_f32 = act_info.b();
- const auto const_6_f32 = vdupq_n_f32(6.f);
- const auto const_0_f32 = vdupq_n_f32(0.f);
- const auto const_3_f32 = vdupq_n_f32(3.f);
- const auto const_inv_6_f32 = vdupq_n_f32(0.166666667f);
+ const UniformQuantizationInfo qi_in = src->info()->quantization_info().uniform();
+ const UniformQuantizationInfo qi_out = dst->info()->quantization_info().uniform();
+ const qasymm8x16_signed_t va = vdupq_n_s8(quantize_qasymm8_signed(act_info.a(), qi_in));
+ const qasymm8x16_signed_t vb = vdupq_n_s8(quantize_qasymm8_signed(act_info.b(), qi_in));
+ const qasymm8_signed_t a = quantize_qasymm8_signed(act_info.a(), qi_in);
+ const qasymm8_signed_t b = quantize_qasymm8_signed(act_info.b(), qi_in);
+ const qasymm8_signed_t const_0 = quantize_qasymm8_signed(0.f, qi_in);
+ const qasymm8x16_signed_t vconst_0 = vdupq_n_s8(const_0);
+ const auto vconst_1 = vdupq_n_f32(1.f);
+#ifndef __aarch64__
+ const auto vconst_0_f32 = vdupq_n_f32(0.f);
+#endif // __aarch64__
+ const float32x4_t va_f32 = vdupq_n_f32(act_info.a());
+ const float32x4_t vb_f32 = vdupq_n_f32(act_info.b());
+ const float a_f32 = act_info.a();
+ const float b_f32 = act_info.b();
+ const auto const_6_f32 = vdupq_n_f32(6.f);
+ const auto const_0_f32 = vdupq_n_f32(0.f);
+ const auto const_3_f32 = vdupq_n_f32(3.f);
+ const auto const_inv_6_f32 = vdupq_n_f32(0.166666667f);
// Initialise scale/offset for re-quantization
float s = qi_in.scale / qi_out.scale;
@@ -158,6 +161,44 @@ void qasymm8_signed_neon_activation(const ITensor *src, ITensor *dst, const Acti
// Re-quantize to new output space
tmp = vquantize_signed(tmp_dep, qi_out);
}
+ else if(act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU)
+ {
+ const auto vin_deq = vdequantize(vin, qi_in);
+
+#ifdef __aarch64__
+ const uint32x4x4_t pos_mask =
+ {
+ {
+ wrapper::vcgtz(vin_deq.val[0]),
+ wrapper::vcgtz(vin_deq.val[1]),
+ wrapper::vcgtz(vin_deq.val[2]),
+ wrapper::vcgtz(vin_deq.val[3]),
+ }
+ };
+#else // __aarch64__
+ const uint32x4x4_t pos_mask =
+ {
+ {
+ wrapper::vcgt(vin_deq.val[0], vconst_0_f32),
+ wrapper::vcgt(vin_deq.val[1], vconst_0_f32),
+ wrapper::vcgt(vin_deq.val[2], vconst_0_f32),
+ wrapper::vcgt(vin_deq.val[3], vconst_0_f32),
+ }
+ };
+#endif // __aarch64__
+
+ const float32x4x4_t tmp_dep =
+ {
+ {
+ wrapper::vbsl(pos_mask.val[0], vin_deq.val[0], wrapper::vmul(va_f32, vin_deq.val[0])),
+ wrapper::vbsl(pos_mask.val[1], vin_deq.val[1], wrapper::vmul(va_f32, vin_deq.val[1])),
+ wrapper::vbsl(pos_mask.val[2], vin_deq.val[2], wrapper::vmul(va_f32, vin_deq.val[2])),
+ wrapper::vbsl(pos_mask.val[3], vin_deq.val[3], wrapper::vmul(va_f32, vin_deq.val[3])),
+ }
+ };
+
+ tmp = vquantize_signed(tmp_dep, qi_out);
+ }
else
{
ARM_COMPUTE_ERROR("Unsupported activation function");
@@ -203,6 +244,12 @@ void qasymm8_signed_neon_activation(const ITensor *src, ITensor *dst, const Acti
tmp_f = tmp_f * ((std::min(std::max((tmp_f + 3), 0.0f), 6.0f)) * 0.166666667f);
tmp = quantize_qasymm8_signed(tmp_f, qi_out);
}
+ else if(act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU)
+ {
+ float tmp_f = dequantize_qasymm8_signed(in, qi_in);
+ tmp_f = tmp_f > 0 ? tmp_f : tmp_f * a_f32;
+ tmp = quantize_qasymm8_signed(tmp_f, qi_out);
+ }
else
{
ARM_COMPUTE_ERROR("Unsupported activation function");
diff --git a/src/core/NEON/wrapper/intrinsics/cgtz.h b/src/core/NEON/wrapper/intrinsics/cgtz.h
new file mode 100644
index 0000000000..025a7ba976
--- /dev/null
+++ b/src/core/NEON/wrapper/intrinsics/cgtz.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_WRAPPER_CGTZ_H
+#define ARM_COMPUTE_WRAPPER_CGTZ_H
+
+#ifdef __aarch64__
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace wrapper
+{
+#define VCGTZ_IMPL(vtype, rtype, prefix, postfix) \
+ inline rtype vcgtz(const vtype &a) \
+ { \
+ return prefix##_##postfix(a); \
+ }
+
+VCGTZ_IMPL(int8x8_t, uint8x8_t, vcgtz, s8)
+VCGTZ_IMPL(int16x4_t, uint16x4_t, vcgtz, s16)
+VCGTZ_IMPL(int32x2_t, uint32x2_t, vcgtz, s32)
+VCGTZ_IMPL(float32x2_t, uint32x2_t, vcgtz, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VCGTZ_IMPL(float16x4_t, uint16x4_t, vcgtz, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+VCGTZ_IMPL(int8x16_t, uint8x16_t, vcgtzq, s8)
+VCGTZ_IMPL(int16x8_t, uint16x8_t, vcgtzq, s16)
+VCGTZ_IMPL(int32x4_t, uint32x4_t, vcgtzq, s32)
+VCGTZ_IMPL(float32x4_t, uint32x4_t, vcgtzq, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VCGTZ_IMPL(float16x8_t, uint16x8_t, vcgtzq, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+#undef VCGTZ_IMPL
+
+} // namespace wrapper
+} // namespace arm_compute
+
+#endif // __aarch64__
+#endif /* ARM_COMPUTE_WRAPPER_CGTZ_H */
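The new header wraps the AArch64-only compare-greater-than-zero intrinsics behind an overload set, so callers can write wrapper::vcgtz(v) for any supported lane type. For instance, VCGTZ_IMPL(float32x4_t, uint32x4_t, vcgtzq, f32) expands to roughly:

inline uint32x4_t vcgtz(const float32x4_t &a)
{
    return vcgtzq_f32(a); // per-lane a > 0.0f, yielding an all-ones mask
}

On armv7 there is no vcgtz* family, which is why the whole header is guarded by #ifdef __aarch64__ and the activation kernels above fall back to wrapper::vcgt against a zero vector on that architecture.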
diff --git a/src/core/NEON/wrapper/intrinsics/intrinsics.h b/src/core/NEON/wrapper/intrinsics/intrinsics.h
index 070f3c7065..c6bad3f9dd 100644
--- a/src/core/NEON/wrapper/intrinsics/intrinsics.h
+++ b/src/core/NEON/wrapper/intrinsics/intrinsics.h
@@ -31,6 +31,7 @@
#include "src/core/NEON/wrapper/intrinsics/ceq.h"
#include "src/core/NEON/wrapper/intrinsics/cge.h"
#include "src/core/NEON/wrapper/intrinsics/cgt.h"
+#include "src/core/NEON/wrapper/intrinsics/cgtz.h"
#include "src/core/NEON/wrapper/intrinsics/cle.h"
#include "src/core/NEON/wrapper/intrinsics/clt.h"
#include "src/core/NEON/wrapper/intrinsics/combine.h"