diff options
Diffstat (limited to 'src/core/NEON')
-rw-r--r-- | src/core/NEON/NEAsymm.h | 85 | ||||
-rw-r--r-- | src/core/NEON/kernels/NEActivationLayerKernel.cpp | 5 | ||||
-rw-r--r-- | src/core/NEON/kernels/activation/impl/qasymm8_neon_activation.cpp | 81 | ||||
-rw-r--r-- | src/core/NEON/kernels/activation/impl/qasymm8_signed_neon_activation.cpp | 81 | ||||
-rw-r--r-- | src/core/NEON/wrapper/intrinsics/cgtz.h | 62 | ||||
-rw-r--r-- | src/core/NEON/wrapper/intrinsics/intrinsics.h | 1 |
6 files changed, 220 insertions, 95 deletions
diff --git a/src/core/NEON/NEAsymm.h b/src/core/NEON/NEAsymm.h index 70d48d5835..9b92a865d0 100644 --- a/src/core/NEON/NEAsymm.h +++ b/src/core/NEON/NEAsymm.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_NEASYMM_H #include "src/core/NEON/NEMath.h" +#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" #include <arm_neon.h> namespace arm_compute @@ -647,35 +648,39 @@ inline int8x8_t vquantize_signed(const float32x4x2_t &qv, const UniformQuantizat return vqmovn_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1]))); } -/** Quantize a neon vector holding 16 floating point values. - * - * @param[in] qv Input values to be quantized. - * @param[in] qi Quantization information to be used in the computation. - * - * @return A neon vector holding the quantized values - */ -inline uint8x16_t vquantize(const float32x4x4_t &qv, const UniformQuantizationInfo &qi) +inline int32x4x4_t vquantize_internal(const float32x4x4_t &qv, float scale, int32_t offset) { - const float scale = qi.scale; - const int offset = qi.offset; - const float32x4_t voffset = vdupq_n_f32(offset); + const int32x4_t voffset = vdupq_n_s32(offset); const float32x4_t vinvscale = vdupq_n_f32(1.f / scale); const int32x4x4_t rf = { { #ifdef __aarch64__ - vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)), - vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)), - vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[2], vinvscale)), - vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[3], vinvscale)), + vaddq_s32(vcvtaq_s32_f32(vmulq_f32(qv.val[0], vinvscale)), voffset), + vaddq_s32(vcvtaq_s32_f32(vmulq_f32(qv.val[1], vinvscale)), voffset), + vaddq_s32(vcvtaq_s32_f32(vmulq_f32(qv.val[2], vinvscale)), voffset), + vaddq_s32(vcvtaq_s32_f32(vmulq_f32(qv.val[3], vinvscale)), voffset), #else //__aarch64__ - vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)), - vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)), - vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[2], vinvscale)), - vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[3], 
vinvscale)), + vaddq_s32(vcvtq_s32_f32(vmulq_f32(qv.val[0], vinvscale)), voffset), + vaddq_s32(vcvtq_s32_f32(vmulq_f32(qv.val[1], vinvscale)), voffset), + vaddq_s32(vcvtq_s32_f32(vmulq_f32(qv.val[2], vinvscale)), voffset), + vaddq_s32(vcvtq_s32_f32(vmulq_f32(qv.val[3], vinvscale)), voffset), #endif //__aarch64__ } }; + return rf; +} + +/** Quantize a neon vector holding 16 floating point values. + * + * @param[in] qv Input values to be quantized. + * @param[in] qi Quantization information to be used in the computation. + * + * @return A neon vector holding the quantized values + */ +inline uint8x16_t vquantize(const float32x4x4_t &qv, const UniformQuantizationInfo &qi) +{ + auto rf = vquantize_internal(qv, qi.scale, qi.offset); const uint8x8_t pa = vqmovun_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1]))); const uint8x8_t pb = vqmovun_s16(vcombine_s16(vqmovn_s32(rf.val[2]), vqmovn_s32(rf.val[3]))); return vcombine_u8(pa, pb); @@ -690,26 +695,7 @@ inline uint8x16_t vquantize(const float32x4x4_t &qv, const UniformQuantizationIn */ inline int8x16_t vquantize_signed(const float32x4x4_t &qv, const UniformQuantizationInfo &qi) { - const float scale = qi.scale; - const int offset = qi.offset; - const float32x4_t voffset = vdupq_n_f32(offset); - const float32x4_t vinvscale = vdupq_n_f32(1.f / scale); - const int32x4x4_t rf = - { - { -#ifdef __aarch64__ - vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)), - vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)), - vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[2], vinvscale)), - vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[3], vinvscale)), -#else //__aarch64__ - vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)), - vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)), - vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[2], vinvscale)), - vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[3], vinvscale)), -#endif //__aarch64__ - } - }; + auto rf = vquantize_internal(qv, qi.scale, qi.offset); const int8x8_t pa = 
vqmovn_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1]))); const int8x8_t pb = vqmovn_s16(vcombine_s16(vqmovn_s32(rf.val[2]), vqmovn_s32(rf.val[3]))); return vcombine_s8(pa, pb); @@ -724,26 +710,7 @@ inline int8x16_t vquantize_signed(const float32x4x4_t &qv, const UniformQuantiza */ inline uint16x8x2_t vquantize_qasymm16(const float32x4x4_t &qv, const UniformQuantizationInfo &qi) { - const float scale = qi.scale; - const int offset = qi.offset; - const float32x4_t voffset = vdupq_n_f32(offset); - const float32x4_t vinvscale = vdupq_n_f32(1.f / scale); - const int32x4x4_t rf = - { - { -#ifdef __aarch64__ - vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)), - vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)), - vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[2], vinvscale)), - vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[3], vinvscale)), -#else //__aarch64__ - vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)), - vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)), - vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[2], vinvscale)), - vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[3], vinvscale)), -#endif //__aarch64__ - } - }; + auto rf = vquantize_internal(qv, qi.scale, qi.offset); const uint16x8_t pa = vcombine_u16(vqmovun_s32(rf.val[0]), vqmovun_s32(rf.val[1])); const uint16x8_t pb = vcombine_u16(vqmovun_s32(rf.val[2]), vqmovun_s32(rf.val[3])); return { pa, pb }; diff --git a/src/core/NEON/kernels/NEActivationLayerKernel.cpp b/src/core/NEON/kernels/NEActivationLayerKernel.cpp index 51257cb96b..d969fd8e38 100644 --- a/src/core/NEON/kernels/NEActivationLayerKernel.cpp +++ b/src/core/NEON/kernels/NEActivationLayerKernel.cpp @@ -110,7 +110,8 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, ActivationLayerInfo::ActivationFunction::LOGISTIC, ActivationLayerInfo::ActivationFunction::TANH, - ActivationLayerInfo::ActivationFunction::HARD_SWISH + 
ActivationLayerInfo::ActivationFunction::HARD_SWISH, + ActivationLayerInfo::ActivationFunction::LEAKY_RELU, }; const static std::set<ActivationLayerInfo::ActivationFunction> qsymm16_supported_activations = { @@ -123,7 +124,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c const ActivationLayerInfo::ActivationFunction f_act = activation_info.activation(); ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_data_type_quantized_asymmetric(data_type) && (qasymm8_supported_activations.count(f_act) == 0), - "For QASYMM8 only tanh, logistic, relu and lower/upper bounded relu are supported"); + "For QASYMM8 only hard swish, leaky relu, tanh, logistic, relu and lower/upper bounded relu are supported"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_data_type_quantized_symmetric(data_type) && (qsymm16_supported_activations.count(f_act) == 0), "For QSYMM16 only tanh and logistic are supported"); diff --git a/src/core/NEON/kernels/activation/impl/qasymm8_neon_activation.cpp b/src/core/NEON/kernels/activation/impl/qasymm8_neon_activation.cpp index 8a398fb531..7b26441824 100644 --- a/src/core/NEON/kernels/activation/impl/qasymm8_neon_activation.cpp +++ b/src/core/NEON/kernels/activation/impl/qasymm8_neon_activation.cpp @@ -51,23 +51,26 @@ void qasymm8_neon_activation(const ITensor *src, ITensor *dst, const ActivationL Iterator input(src, win_collapsed); Iterator output(dst, win_collapsed); - const UniformQuantizationInfo qi_in = src->info()->quantization_info().uniform(); - const UniformQuantizationInfo qi_out = dst->info()->quantization_info().uniform(); - const qasymm8x16_t va = vdupq_n_u8(quantize_qasymm8(act_info.a(), qi_in)); - const qasymm8x16_t vb = vdupq_n_u8(quantize_qasymm8(act_info.b(), qi_in)); - const qasymm8_t a = quantize_qasymm8(act_info.a(), qi_in); - const qasymm8_t b = quantize_qasymm8(act_info.b(), qi_in); - const qasymm8_t const_0 = quantize_qasymm8(0.f, qi_in); - const qasymm8x16_t vconst_0 = vdupq_n_u8(const_0); - const auto vconst_1 = 
vdupq_n_f32(1.f); - const float32x4_t va_f32 = vdupq_n_f32(act_info.a()); - const float32x4_t vb_f32 = vdupq_n_f32(act_info.b()); - const float a_f32 = act_info.a(); - const float b_f32 = act_info.b(); - const auto const_6_f32 = vdupq_n_f32(6.f); - const auto const_0_f32 = vdupq_n_f32(0.f); - const auto const_3_f32 = vdupq_n_f32(3.f); - const auto const_inv_6_f32 = vdupq_n_f32(0.166666667f); + const UniformQuantizationInfo qi_in = src->info()->quantization_info().uniform(); + const UniformQuantizationInfo qi_out = dst->info()->quantization_info().uniform(); + const qasymm8x16_t va = vdupq_n_u8(quantize_qasymm8(act_info.a(), qi_in)); + const qasymm8x16_t vb = vdupq_n_u8(quantize_qasymm8(act_info.b(), qi_in)); + const qasymm8_t a = quantize_qasymm8(act_info.a(), qi_in); + const qasymm8_t b = quantize_qasymm8(act_info.b(), qi_in); + const qasymm8_t const_0 = quantize_qasymm8(0.f, qi_in); + const qasymm8x16_t vconst_0 = vdupq_n_u8(const_0); + const auto vconst_1 = vdupq_n_f32(1.f); +#ifndef __aarch64__ + const auto vconst_0_f32 = vdupq_n_f32(0); +#endif // __aarch64__ + const float32x4_t va_f32 = vdupq_n_f32(act_info.a()); + const float32x4_t vb_f32 = vdupq_n_f32(act_info.b()); + const float a_f32 = act_info.a(); + const float b_f32 = act_info.b(); + const auto const_6_f32 = vdupq_n_f32(6.f); + const auto const_0_f32 = vdupq_n_f32(0.f); + const auto const_3_f32 = vdupq_n_f32(3.f); + const auto const_inv_6_f32 = vdupq_n_f32(0.166666667f); // Initialise scale/offset for re-quantization float s = qi_in.scale / qi_out.scale; @@ -159,6 +162,44 @@ void qasymm8_neon_activation(const ITensor *src, ITensor *dst, const ActivationL // Re-quantize to new output space tmp = vquantize(tmp_dep, qi_out); } + else if(act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU) + { + const auto vin_deq = vdequantize(vin, qi_in); + +#ifdef __aarch64__ + const uint32x4x4_t pos_mask = + { + { + wrapper::vcgtz(vin_deq.val[0]), + wrapper::vcgtz(vin_deq.val[1]), + 
wrapper::vcgtz(vin_deq.val[2]), + wrapper::vcgtz(vin_deq.val[3]), + } + }; +#else // __aarch64__ + const uint32x4x4_t pos_mask = + { + { + wrapper::vcgt(vin_deq.val[0], vconst_0_f32), + wrapper::vcgt(vin_deq.val[1], vconst_0_f32), + wrapper::vcgt(vin_deq.val[2], vconst_0_f32), + wrapper::vcgt(vin_deq.val[3], vconst_0_f32), + } + }; +#endif // __aarch64__ + + const float32x4x4_t tmp_dep = + { + { + wrapper::vbsl(pos_mask.val[0], vin_deq.val[0], wrapper::vmul(va_f32, vin_deq.val[0])), + wrapper::vbsl(pos_mask.val[1], vin_deq.val[1], wrapper::vmul(va_f32, vin_deq.val[1])), + wrapper::vbsl(pos_mask.val[2], vin_deq.val[2], wrapper::vmul(va_f32, vin_deq.val[2])), + wrapper::vbsl(pos_mask.val[3], vin_deq.val[3], wrapper::vmul(va_f32, vin_deq.val[3])), + } + }; + + tmp = vquantize(tmp_dep, qi_out); + } else { ARM_COMPUTE_ERROR("Unsupported activation function"); @@ -204,6 +245,12 @@ void qasymm8_neon_activation(const ITensor *src, ITensor *dst, const ActivationL tmp_f = tmp_f * ((std::min(std::max((tmp_f + 3), 0.0f), 6.0f)) * 0.166666667f); tmp = quantize_qasymm8(tmp_f, qi_out); } + else if(act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU) + { + float tmp_f = dequantize_qasymm8(in, qi_in); + tmp_f = tmp_f > 0 ? 
tmp_f : tmp_f * a_f32; + tmp = quantize_qasymm8(tmp_f, qi_out); + } else { ARM_COMPUTE_ERROR("Unsupported activation function"); diff --git a/src/core/NEON/kernels/activation/impl/qasymm8_signed_neon_activation.cpp b/src/core/NEON/kernels/activation/impl/qasymm8_signed_neon_activation.cpp index bfab07c8e3..c616c5e27d 100644 --- a/src/core/NEON/kernels/activation/impl/qasymm8_signed_neon_activation.cpp +++ b/src/core/NEON/kernels/activation/impl/qasymm8_signed_neon_activation.cpp @@ -50,23 +50,26 @@ void qasymm8_signed_neon_activation(const ITensor *src, ITensor *dst, const Acti Iterator input(src, win_collapsed); Iterator output(dst, win_collapsed); - const UniformQuantizationInfo qi_in = src->info()->quantization_info().uniform(); - const UniformQuantizationInfo qi_out = dst->info()->quantization_info().uniform(); - const qasymm8x16_signed_t va = vdupq_n_s8(quantize_qasymm8_signed(act_info.a(), qi_in)); - const qasymm8x16_signed_t vb = vdupq_n_s8(quantize_qasymm8_signed(act_info.b(), qi_in)); - const qasymm8_signed_t a = quantize_qasymm8_signed(act_info.a(), qi_in); - const qasymm8_signed_t b = quantize_qasymm8_signed(act_info.b(), qi_in); - const qasymm8_signed_t const_0 = quantize_qasymm8_signed(0.f, qi_in); - const qasymm8x16_signed_t vconst_0 = vdupq_n_s8(const_0); - const auto vconst_1 = vdupq_n_f32(1.f); - const float32x4_t va_f32 = vdupq_n_f32(act_info.a()); - const float32x4_t vb_f32 = vdupq_n_f32(act_info.b()); - const float a_f32 = act_info.a(); - const float b_f32 = act_info.b(); - const auto const_6_f32 = vdupq_n_f32(6.f); - const auto const_0_f32 = vdupq_n_f32(0.f); - const auto const_3_f32 = vdupq_n_f32(3.f); - const auto const_inv_6_f32 = vdupq_n_f32(0.166666667f); + const UniformQuantizationInfo qi_in = src->info()->quantization_info().uniform(); + const UniformQuantizationInfo qi_out = dst->info()->quantization_info().uniform(); + const qasymm8x16_signed_t va = vdupq_n_s8(quantize_qasymm8_signed(act_info.a(), qi_in)); + const qasymm8x16_signed_t 
vb = vdupq_n_s8(quantize_qasymm8_signed(act_info.b(), qi_in)); + const qasymm8_signed_t a = quantize_qasymm8_signed(act_info.a(), qi_in); + const qasymm8_signed_t b = quantize_qasymm8_signed(act_info.b(), qi_in); + const qasymm8_signed_t const_0 = quantize_qasymm8_signed(0.f, qi_in); + const qasymm8x16_signed_t vconst_0 = vdupq_n_s8(const_0); + const auto vconst_1 = vdupq_n_f32(1.f); +#ifndef __aarch64__ + const auto vconst_0_f32 = vdupq_n_f32(0.f); +#endif // __aarch64__ + const float32x4_t va_f32 = vdupq_n_f32(act_info.a()); + const float32x4_t vb_f32 = vdupq_n_f32(act_info.b()); + const float a_f32 = act_info.a(); + const float b_f32 = act_info.b(); + const auto const_6_f32 = vdupq_n_f32(6.f); + const auto const_0_f32 = vdupq_n_f32(0.f); + const auto const_3_f32 = vdupq_n_f32(3.f); + const auto const_inv_6_f32 = vdupq_n_f32(0.166666667f); // Initialise scale/offset for re-quantization float s = qi_in.scale / qi_out.scale; @@ -158,6 +161,44 @@ void qasymm8_signed_neon_activation(const ITensor *src, ITensor *dst, const Acti // Re-quantize to new output space tmp = vquantize_signed(tmp_dep, qi_out); } + else if(act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU) + { + const auto vin_deq = vdequantize(vin, qi_in); + +#ifdef __aarch64__ + const uint32x4x4_t pos_mask = + { + { + wrapper::vcgtz(vin_deq.val[0]), + wrapper::vcgtz(vin_deq.val[1]), + wrapper::vcgtz(vin_deq.val[2]), + wrapper::vcgtz(vin_deq.val[3]), + } + }; +#else // __aarch64__ + const uint32x4x4_t pos_mask = + { + { + wrapper::vcgt(vin_deq.val[0], vconst_0_f32), + wrapper::vcgt(vin_deq.val[1], vconst_0_f32), + wrapper::vcgt(vin_deq.val[2], vconst_0_f32), + wrapper::vcgt(vin_deq.val[3], vconst_0_f32), + } + }; +#endif // __aarch64__ + + const float32x4x4_t tmp_dep = + { + { + wrapper::vbsl(pos_mask.val[0], vin_deq.val[0], wrapper::vmul(va_f32, vin_deq.val[0])), + wrapper::vbsl(pos_mask.val[1], vin_deq.val[1], wrapper::vmul(va_f32, vin_deq.val[1])), + wrapper::vbsl(pos_mask.val[2], vin_deq.val[2], 
wrapper::vmul(va_f32, vin_deq.val[2])), + wrapper::vbsl(pos_mask.val[3], vin_deq.val[3], wrapper::vmul(va_f32, vin_deq.val[3])), + } + }; + + tmp = vquantize_signed(tmp_dep, qi_out); + } else { ARM_COMPUTE_ERROR("Unsupported activation function"); @@ -203,6 +244,12 @@ void qasymm8_signed_neon_activation(const ITensor *src, ITensor *dst, const Acti tmp_f = tmp_f * ((std::min(std::max((tmp_f + 3), 0.0f), 6.0f)) * 0.166666667f); tmp = quantize_qasymm8_signed(tmp_f, qi_out); } + else if(act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU) + { + float tmp_f = dequantize_qasymm8_signed(in, qi_in); + tmp_f = tmp_f > 0 ? tmp_f : tmp_f * a_f32; + tmp = quantize_qasymm8_signed(tmp_f, qi_out); + } else { ARM_COMPUTE_ERROR("Unsupported activation function"); diff --git a/src/core/NEON/wrapper/intrinsics/cgtz.h b/src/core/NEON/wrapper/intrinsics/cgtz.h new file mode 100644 index 0000000000..025a7ba976 --- /dev/null +++ b/src/core/NEON/wrapper/intrinsics/cgtz.h @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_WRAPPER_CGTZ_H +#define ARM_COMPUTE_WRAPPER_CGTZ_H + +#ifdef __aarch64__ +#include <arm_neon.h> + +namespace arm_compute +{ +namespace wrapper +{ +#define VCGTZ_IMPL(vtype, rtype, prefix, postfix) \ + inline rtype vcgtz(const vtype &a) \ + { \ + return prefix##_##postfix(a); \ + } + +VCGTZ_IMPL(int8x8_t, uint8x8_t, vcgtz, s8) +VCGTZ_IMPL(int16x4_t, uint16x4_t, vcgtz, s16) +VCGTZ_IMPL(int32x2_t, uint32x2_t, vcgtz, s32) +VCGTZ_IMPL(float32x2_t, uint32x2_t, vcgtz, f32) +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +VCGTZ_IMPL(float16x4_t, uint16x4_t, vcgtz, f16) +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + +VCGTZ_IMPL(int8x16_t, uint8x16_t, vcgtzq, s8) +VCGTZ_IMPL(int16x8_t, uint16x8_t, vcgtzq, s16) +VCGTZ_IMPL(int32x4_t, uint32x4_t, vcgtzq, s32) +VCGTZ_IMPL(float32x4_t, uint32x4_t, vcgtzq, f32) +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +VCGTZ_IMPL(float16x8_t, uint16x8_t, vcgtzq, f16) +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + +#undef VCGTZ_IMPL + +} // namespace wrapper +} // namespace arm_compute + +#endif // __aarch64__ +#endif /* ARM_COMPUTE_WRAPPER_CGTZ_H */ diff --git a/src/core/NEON/wrapper/intrinsics/intrinsics.h b/src/core/NEON/wrapper/intrinsics/intrinsics.h index 070f3c7065..c6bad3f9dd 100644 --- a/src/core/NEON/wrapper/intrinsics/intrinsics.h +++ b/src/core/NEON/wrapper/intrinsics/intrinsics.h @@ -31,6 +31,7 @@ #include "src/core/NEON/wrapper/intrinsics/ceq.h" #include "src/core/NEON/wrapper/intrinsics/cge.h" #include "src/core/NEON/wrapper/intrinsics/cgt.h" +#include "src/core/NEON/wrapper/intrinsics/cgtz.h" #include "src/core/NEON/wrapper/intrinsics/cle.h" #include "src/core/NEON/wrapper/intrinsics/clt.h" #include 
"src/core/NEON/wrapper/intrinsics/combine.h" |