22 files changed, 949 insertions, 809 deletions
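The change set below has one recurring shape: per-type NEON intrinsics (vadd_u8, vaddq_f32, ...) are collapsed into overloaded wrapper functions stamped out by macros (the new abs.h, log.h, tanh.h and the extended add.h, neg.h), and the kernels are reworked to walk the X dimension with a vector main loop plus a scalar left-over loop, which is why border_size() and the "Insufficient Padding!" window logic can be deleted. A minimal sketch of both ideas, assuming an AArch64/NEON target; the function name add_f32 and the raw pointers are illustrative only, not the library's ITensor/Iterator API:

#include <arm_neon.h>
#include <cstddef>

// Overload stamping, as in wrapper/intrinsics/add.h: one generic name per
// operation, dispatched on argument type instead of on intrinsic suffix.
#define VADD_IMPL(vtype, prefix, postfix)             \
    inline vtype vadd(const vtype &a, const vtype &b) \
    {                                                 \
        return prefix##_##postfix(a, b);              \
    }
VADD_IMPL(float32x4_t, vaddq, f32)
VADD_IMPL(int16x8_t, vaddq, s16)
#undef VADD_IMPL

// Vector main loop + scalar tail, the pattern the reworked kernels adopt:
// advance X by 16 bytes' worth of elements per step, then finish the
// remainder one element at a time, so no padded reads or writes ever occur.
void add_f32(const float *a, const float *b, float *out, std::size_t n)
{
    constexpr std::size_t step = 16 / sizeof(float); // 4 lanes per 128-bit vector
    std::size_t           x    = 0;
    for(; x + step <= n; x += step)
    {
        vst1q_f32(out + x, vadd(vld1q_f32(a + x), vld1q_f32(b + x)));
    }
    for(; x < n; ++x) // scalar left-overs
    {
        out[x] = a[x] + b[x];
    }
}

The same mechanism backs the new vabs/vqabs, vqadd, vlog and vtanh wrappers, with the FP16 variants guarded by __ARM_FEATURE_FP16_VECTOR_ARITHMETIC (and vlogq_f16 itself now falling back to two vlogq_f32 calls, per COMPMID-1535).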
diff --git a/arm_compute/core/NEON/NEMath.inl b/arm_compute/core/NEON/NEMath.inl index 4de80509f0..27b4fc2c1b 100644 --- a/arm_compute/core/NEON/NEMath.inl +++ b/arm_compute/core/NEON/NEMath.inl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2018 ARM Limited. + * Copyright (c) 2016-2019 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -255,34 +255,12 @@ inline float16x8_t vexpq_f16(float16x8_t x) inline float16x8_t vlogq_f16(float16x8_t x) { - static const std::array<float16x8_t, 8> log_tab_f16 = - { - { - vdupq_n_f16(-2.29561495781f), - vdupq_n_f16(-2.47071170807f), - vdupq_n_f16(-5.68692588806f), - vdupq_n_f16(-0.165253549814f), - vdupq_n_f16(5.17591238022f), - vdupq_n_f16(0.844007015228f), - vdupq_n_f16(4.58445882797f), - vdupq_n_f16(0.0141278216615f), - } - }; - - static const int16x8_t CONST_127 = vdupq_n_s16(127); // 127 - static const float16x8_t CONST_LN2 = vdupq_n_f16(0.6931471805f); // ln(2) - - // Extract exponent - const int16x8_t m = vsubq_s16(vreinterpretq_s16_u16(vshrq_n_u16(vreinterpretq_u16_f16(x), 9)), CONST_127); - const float16x8_t val = vreinterpretq_f16_s16(vsubq_s16(vreinterpretq_s16_f16(x), vshlq_n_s16(m, 9))); - - // Polynomial Approximation - float16x8_t poly = vtaylor_polyq_f16(val, log_tab_f16); - - // Reconstruct - poly = vaddq_f16(poly, vmulq_f16(vcvtq_f16_s16(m), CONST_LN2)); + // TODO (COMPMID-1535) : Revisit FP16 approximations + const float32x4_t x_high = vcvt_f32_f16(vget_high_f16(x)); + const float32x4_t x_low = vcvt_f32_f16(vget_low_f16(x)); - return poly; + const float16x8_t res = vcvt_high_f16_f32(vcvt_f16_f32(vlogq_f32(x_low)), vlogq_f32(x_high)); + return res; } inline float16x8_t vpowq_f16(float16x8_t val, float16x8_t n) diff --git a/arm_compute/core/NEON/kernels/NEActivationLayerKernel.h b/arm_compute/core/NEON/kernels/NEActivationLayerKernel.h index 0290e32085..447f4880ee 100644 --- a/arm_compute/core/NEON/kernels/NEActivationLayerKernel.h +++ b/arm_compute/core/NEON/kernels/NEActivationLayerKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2018 ARM Limited. + * Copyright (c) 2017-2019 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -26,6 +26,7 @@ #include "arm_compute/core/NEON/INEKernel.h" #include "arm_compute/core/QAsymm8.h" +#include "arm_compute/core/utils/misc/Traits.h" #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC #include <arm_fp16.h> @@ -89,15 +90,8 @@ private: * @param[in] window Region on which to execute the kernel */ template <ActivationLayerInfo::ActivationFunction F, typename T> - typename std::enable_if<std::is_same<T, float>::value, void>::type activation(const Window &window); -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - /** Function to apply an activation function on a tensor. - * - * @param[in] window Region on which to execute the kernel - */ - template <ActivationLayerInfo::ActivationFunction F, typename T> - typename std::enable_if<std::is_same<T, float16_t>::value, void>::type activation(const Window &window); -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ + typename std::enable_if<arm_compute::utils::traits::is_floating_point<T>::value, void>::type + activation(const Window &window); /** Function to apply an activation function on a tensor. 
* * @param[in] window Region on which to execute the kernel diff --git a/arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h b/arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h index 73beca6ded..872c3a5b6b 100644 --- a/arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h +++ b/arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2018 ARM Limited. + * Copyright (c) 2016-2019 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -84,7 +84,6 @@ public: // Inherited methods overridden: void run(const Window &window, const ThreadInfo &info) override; - BorderSize border_size() const override; private: /** Common signature for all the specialised add functions @@ -92,14 +91,16 @@ private: * @param[in] input1 An input tensor. Data types supported: U8/QASYMM8/S16/F16/F32 * @param[in] input2 An input tensor. Data types supported: U8/QASYMM8/S16/F16/F32 * @param[out] output The output tensor. Data types supported: U8/QASYMM8/S16/F16/F32. + * @param[in] policy Overflow policy. * @param[in] window Region on which to execute the kernel. */ - using AddFunction = void(const ITensor *input1, const ITensor *input2, ITensor *output, const Window &window); + using AddFunction = void(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy, const Window &window); /** Add function to use for the particular tensor types passed to configure() */ AddFunction *_func; const ITensor *_input1; const ITensor *_input2; ITensor *_output; + ConvertPolicy _policy; }; } // namespace arm_compute #endif /*__ARM_COMPUTE_NEARITHMETICADDITIONKERNEL_H__ */ diff --git a/arm_compute/core/NEON/kernels/detail/NEActivationFunctionDetail.h b/arm_compute/core/NEON/kernels/detail/NEActivationFunctionDetail.h index 9344235d09..bde3ac82e7 100644 --- a/arm_compute/core/NEON/kernels/detail/NEActivationFunctionDetail.h +++ b/arm_compute/core/NEON/kernels/detail/NEActivationFunctionDetail.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018 ARM Limited. + * Copyright (c) 2018-2019 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -139,7 +139,7 @@ struct logistic */ void operator()(ExactType &vval) { - vval = wrapper::vinv(wrapper::vadd(vone, wrapper::vexpq(wrapper::vnegq(vval)))); + vval = wrapper::vinv(wrapper::vadd(vone, wrapper::vexpq(wrapper::vneg(vval)))); } /** Vector of ones. */ diff --git a/arm_compute/core/NEON/wrapper/intrinsics/abs.h b/arm_compute/core/NEON/wrapper/intrinsics/abs.h new file mode 100644 index 0000000000..97d11e951e --- /dev/null +++ b/arm_compute/core/NEON/wrapper/intrinsics/abs.h @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2018-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_WRAPPER_ABS_H__ +#define __ARM_COMPUTE_WRAPPER_ABS_H__ + +#include <arm_neon.h> + +namespace arm_compute +{ +namespace wrapper +{ +#define VABS_IMPL(stype, vtype, prefix, postfix) \ + inline vtype vabs(const vtype &a) \ + { \ + return prefix##_##postfix(a); \ + } + +#define VQABS_IMPL(stype, vtype, prefix, postfix) \ + inline vtype vqabs(const vtype &a) \ + { \ + return prefix##_##postfix(a); \ + } + +// Absolute: vabs{q}_<type>. Vd[i] = |Va[i]| +VABS_IMPL(int8x8_t, int8x8_t, vabs, s8) +VABS_IMPL(int16x4_t, int16x4_t, vabs, s16) +VABS_IMPL(int32x2_t, int32x2_t, vabs, s32) +VABS_IMPL(float32x2_t, float32x2_t, vabs, f32) +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +VABS_IMPL(float16x4_t, float16x4_t, vabs, f16) +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + +VABS_IMPL(int8x16_t, int8x16_t, vabsq, s8) +VABS_IMPL(int16x8_t, int16x8_t, vabsq, s16) +VABS_IMPL(int32x4_t, int32x4_t, vabsq, s32) +VABS_IMPL(float32x4_t, float32x4_t, vabsq, f32) +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +VABS_IMPL(float16x8_t, float16x8_t, vabsq, f16) +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + +// Saturating absolute: vqabs{q}_<type>. Vd[i] = sat(|Va[i]|) +VQABS_IMPL(int8x8_t, int8x8_t, vqabs, s8) +VQABS_IMPL(int16x4_t, int16x4_t, vqabs, s16) +VQABS_IMPL(int32x2_t, int32x2_t, vqabs, s32) + +VQABS_IMPL(int8x16_t, int8x16_t, vqabsq, s8) +VQABS_IMPL(int16x8_t, int16x8_t, vqabsq, s16) +VQABS_IMPL(int32x4_t, int32x4_t, vqabsq, s32) + +#undef VABS_IMPL +#undef VQABS_IMPL +} // namespace wrapper +} // namespace arm_compute +#endif /* __ARM_COMPUTE_WRAPPER_ABS_H__ */ diff --git a/arm_compute/core/NEON/wrapper/intrinsics/add.h b/arm_compute/core/NEON/wrapper/intrinsics/add.h index da730f133c..4f4d244489 100644 --- a/arm_compute/core/NEON/wrapper/intrinsics/add.h +++ b/arm_compute/core/NEON/wrapper/intrinsics/add.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018 ARM Limited. + * Copyright (c) 2018-2019 ARM Limited. 
* * SPDX-License-Identifier: MIT * @@ -61,8 +61,41 @@ VADD_IMPL(float32x4_t, float32x4_t, vaddq, f32) #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC VADD_IMPL(float16x8_t, float16x8_t, vaddq, f16) #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - #undef VADD_IMPL + +#define VQADD_IMPL(stype, vtype, prefix, postfix) \ + inline vtype vqadd(const vtype &a, const vtype &b) \ + { \ + return prefix##_##postfix(a, b); \ + } + +// VQADD: Vector saturating add (No notion of saturation for floating point) +VQADD_IMPL(uint8x8_t, uint8x8_t, vqadd, u8) +VQADD_IMPL(int8x8_t, int8x8_t, vqadd, s8) +VQADD_IMPL(uint16x4_t, uint16x4_t, vqadd, u16) +VQADD_IMPL(int16x4_t, int16x4_t, vqadd, s16) +VQADD_IMPL(uint32x2_t, uint32x2_t, vqadd, u32) +VQADD_IMPL(int32x2_t, int32x2_t, vqadd, s32) +VQADD_IMPL(uint64x1_t, uint64x1_t, vqadd, u64) +VQADD_IMPL(int64x1_t, int64x1_t, vqadd, s64) +VQADD_IMPL(float32x2_t, float32x2_t, vadd, f32) +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +VQADD_IMPL(float16x4_t, float16x4_t, vadd, f16) +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + +VQADD_IMPL(uint8x16_t, uint8x16_t, vqaddq, u8) +VQADD_IMPL(int8x16_t, int8x16_t, vqaddq, s8) +VQADD_IMPL(uint16x8_t, uint16x8_t, vqaddq, u16) +VQADD_IMPL(int16x8_t, int16x8_t, vqaddq, s16) +VQADD_IMPL(uint32x4_t, uint32x4_t, vqaddq, u32) +VQADD_IMPL(int32x4_t, int32x4_t, vqaddq, s32) +VQADD_IMPL(uint64x2_t, uint64x2_t, vqaddq, u64) +VQADD_IMPL(int64x2_t, int64x2_t, vqaddq, s64) +VQADD_IMPL(float32x4_t, float32x4_t, vaddq, f32) +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +VQADD_IMPL(float16x8_t, float16x8_t, vaddq, f16) +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +#undef VQADD_IMPL } // namespace wrapper } // namespace arm_compute #endif /* __ARM_COMPUTE_WRAPPER_ADD_H__ */ diff --git a/arm_compute/core/NEON/wrapper/intrinsics/bsl.h b/arm_compute/core/NEON/wrapper/intrinsics/bsl.h index 9831b4b842..38f9d5f171 100644 --- a/arm_compute/core/NEON/wrapper/intrinsics/bsl.h +++ b/arm_compute/core/NEON/wrapper/intrinsics/bsl.h @@ -30,32 +30,32 @@ namespace arm_compute { namespace wrapper { -#define VBSL_IMPL(vctype, vtype, prefix, postfix) \ - inline vtype vbsl(const vctype &a, const vtype &b, const vtype &c) \ - { \ - return prefix##_##postfix(a, b, c); \ +#define VBSL_IMPL(stype, vtype, ctype, prefix, postfix) \ + inline vtype vbsl(const ctype &a, const vtype &b, const vtype &c) \ + { \ + return prefix##_##postfix(a, b, c); \ } -VBSL_IMPL(uint8x8_t, uint8x8_t, vbsl, u8) -VBSL_IMPL(uint8x8_t, int8x8_t, vbsl, s8) -VBSL_IMPL(uint16x4_t, uint16x4_t, vbsl, u16) -VBSL_IMPL(uint16x4_t, int16x4_t, vbsl, s16) -VBSL_IMPL(uint32x2_t, uint32x2_t, vbsl, u32) -VBSL_IMPL(uint32x2_t, int32x2_t, vbsl, s32) -VBSL_IMPL(uint32x2_t, float32x2_t, vbsl, f32) +VBSL_IMPL(uint8_t, uint8x8_t, uint8x8_t, vbsl, u8) +VBSL_IMPL(int8_t, int8x8_t, uint8x8_t, vbsl, s8) +VBSL_IMPL(uint16_t, uint16x4_t, uint16x4_t, vbsl, u16) +VBSL_IMPL(int16_t, int16x4_t, uint16x4_t, vbsl, s16) +VBSL_IMPL(uint32_t, uint32x2_t, uint32x2_t, vbsl, u32) +VBSL_IMPL(int32_t, int32x2_t, uint32x2_t, vbsl, s32) +VBSL_IMPL(float32x2_t, float32x2_t, uint32x2_t, vbsl, f32) #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -VBSL_IMPL(uint16x4_t, float16x4_t, vbsl, f16) +VBSL_IMPL(float16x4_t, float16x4_t, uint16x4_t, vbsl, f16) #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -VBSL_IMPL(uint8x16_t, uint8x16_t, vbslq, u8) -VBSL_IMPL(uint8x16_t, int8x16_t, vbslq, s8) -VBSL_IMPL(uint16x8_t, uint16x8_t, vbslq, u16) -VBSL_IMPL(uint16x8_t, int16x8_t, vbslq, s16) -VBSL_IMPL(uint32x4_t, uint32x4_t, vbslq, u32) 
-VBSL_IMPL(uint32x4_t, int32x4_t, vbslq, s32) -VBSL_IMPL(uint32x4_t, float32x4_t, vbslq, f32) +VBSL_IMPL(uint8_t, uint8x16_t, uint8x16_t, vbslq, u8) +VBSL_IMPL(int8_t, int8x16_t, uint8x16_t, vbslq, s8) +VBSL_IMPL(uint16_t, uint16x8_t, uint16x8_t, vbslq, u16) +VBSL_IMPL(int16_t, int16x8_t, uint16x8_t, vbslq, s16) +VBSL_IMPL(uint32_t, uint32x4_t, uint32x4_t, vbslq, u32) +VBSL_IMPL(int32_t, int32x4_t, uint32x4_t, vbslq, s32) +VBSL_IMPL(float32x4_t, float32x4_t, uint32x4_t, vbslq, f32) #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -VBSL_IMPL(uint16x8_t, float16x8_t, vbslq, f16) +VBSL_IMPL(float16x8_t, float16x8_t, uint16x8_t, vbslq, f16) #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC #undef VBSL_IMPL diff --git a/arm_compute/core/NEON/wrapper/intrinsics/cgt.h b/arm_compute/core/NEON/wrapper/intrinsics/cgt.h index c2ed9df1dc..9563b0cd12 100644 --- a/arm_compute/core/NEON/wrapper/intrinsics/cgt.h +++ b/arm_compute/core/NEON/wrapper/intrinsics/cgt.h @@ -30,10 +30,10 @@ namespace arm_compute { namespace wrapper { -#define VCGT_IMPL(votype, vtype, prefix, postfix) \ - inline votype vcgt(const vtype &a, const vtype &b) \ - { \ - return prefix##_##postfix(a, b); \ +#define VCGT_IMPL(rtype, vtype, prefix, postfix) \ + inline rtype vcgt(const vtype &a, const vtype &b) \ + { \ + return prefix##_##postfix(a, b); \ } VCGT_IMPL(uint8x8_t, uint8x8_t, vcgt, u8) diff --git a/arm_compute/core/NEON/wrapper/intrinsics/intrinsics.h b/arm_compute/core/NEON/wrapper/intrinsics/intrinsics.h index 896e5106ab..a0193ee3d2 100644 --- a/arm_compute/core/NEON/wrapper/intrinsics/intrinsics.h +++ b/arm_compute/core/NEON/wrapper/intrinsics/intrinsics.h @@ -24,6 +24,7 @@ #ifndef __ARM_COMPUTE_WRAPPER_INTRINSICS_H__ #define __ARM_COMPUTE_WRAPPER_INTRINSICS_H__ +#include "arm_compute/core/NEON/wrapper/intrinsics/abs.h" #include "arm_compute/core/NEON/wrapper/intrinsics/add.h" #include "arm_compute/core/NEON/wrapper/intrinsics/and.h" #include "arm_compute/core/NEON/wrapper/intrinsics/bsl.h" @@ -39,6 +40,7 @@ #include "arm_compute/core/NEON/wrapper/intrinsics/inv.h" #include "arm_compute/core/NEON/wrapper/intrinsics/invsqrt.h" #include "arm_compute/core/NEON/wrapper/intrinsics/load.h" +#include "arm_compute/core/NEON/wrapper/intrinsics/log.h" #include "arm_compute/core/NEON/wrapper/intrinsics/max.h" #include "arm_compute/core/NEON/wrapper/intrinsics/min.h" #include "arm_compute/core/NEON/wrapper/intrinsics/mla.h" @@ -55,5 +57,6 @@ #include "arm_compute/core/NEON/wrapper/intrinsics/setlane.h" #include "arm_compute/core/NEON/wrapper/intrinsics/store.h" #include "arm_compute/core/NEON/wrapper/intrinsics/sub.h" +#include "arm_compute/core/NEON/wrapper/intrinsics/tanh.h" #endif /* __ARM_COMPUTE_WRAPPER_INTRINSICS_H__ */ diff --git a/arm_compute/core/NEON/wrapper/intrinsics/log.h b/arm_compute/core/NEON/wrapper/intrinsics/log.h new file mode 100644 index 0000000000..5367afb858 --- /dev/null +++ b/arm_compute/core/NEON/wrapper/intrinsics/log.h @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2018-2019 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_WRAPPER_LOG_H__ +#define __ARM_COMPUTE_WRAPPER_LOG_H__ + +#include "arm_compute/core/NEON/NEMath.h" +#include <arm_neon.h> + +namespace arm_compute +{ +namespace wrapper +{ +#define VLOG_IMPL(vtype, prefix, postfix) \ + inline vtype vlog(const vtype &a) \ + { \ + return prefix##_##postfix(a); \ + } + +VLOG_IMPL(float32x4_t, vlogq, f32) +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +VLOG_IMPL(float16x8_t, vlogq, f16) +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +#undef VLOG_IMPL +} // namespace wrapper +} // namespace arm_compute +#endif /* __ARM_COMPUTE_WRAPPER_LOG_H__ */ diff --git a/arm_compute/core/NEON/wrapper/intrinsics/neg.h b/arm_compute/core/NEON/wrapper/intrinsics/neg.h index 0ea1d429fe..7072866003 100644 --- a/arm_compute/core/NEON/wrapper/intrinsics/neg.h +++ b/arm_compute/core/NEON/wrapper/intrinsics/neg.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018 ARM Limited. + * Copyright (c) 2018-2019 ARM Limited. 
* * SPDX-License-Identifier: MIT * @@ -30,36 +30,29 @@ namespace arm_compute { namespace wrapper { -#define VNEG_IMPL(vtype, postfix) \ - inline vtype vneg(const vtype &a) \ - { \ - return vneg_##postfix(a); \ +#define VNEG_IMPL(vtype, prefix, postfix) \ + inline vtype vneg(const vtype &a) \ + { \ + return prefix##_##postfix(a); \ } -VNEG_IMPL(int8x8_t, s8) -VNEG_IMPL(int16x4_t, s16) -VNEG_IMPL(int32x2_t, s32) -VNEG_IMPL(float32x2_t, f32) +VNEG_IMPL(int8x8_t, vneg, s8) +VNEG_IMPL(int16x4_t, vneg, s16) +VNEG_IMPL(int32x2_t, vneg, s32) +VNEG_IMPL(float32x2_t, vneg, f32) #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -VNEG_IMPL(float16x4_t, f16) +VNEG_IMPL(float16x4_t, vneg, f16) #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -#undef VNEG_IMPL -#define VNEGQ_IMPL(vtype, postfix) \ - inline vtype vnegq(const vtype &a) \ - { \ - return vnegq_##postfix(a); \ - } - -VNEGQ_IMPL(int8x16_t, s8) -VNEGQ_IMPL(int16x8_t, s16) -VNEGQ_IMPL(int32x4_t, s32) -VNEGQ_IMPL(float32x4_t, f32) +VNEG_IMPL(int8x16_t, vnegq, s8) +VNEG_IMPL(int16x8_t, vnegq, s16) +VNEG_IMPL(int32x4_t, vnegq, s32) +VNEG_IMPL(float32x4_t, vnegq, f32) #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -VNEGQ_IMPL(float16x8_t, f16) +VNEG_IMPL(float16x8_t, vnegq, f16) #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -#undef VNEGQ_IMPL +#undef VNEG_IMPL } // namespace wrapper } // namespace arm_compute #endif /* __ARM_COMPUTE_WRAPPER_NEG_H__ */ diff --git a/arm_compute/core/NEON/wrapper/intrinsics/tanh.h b/arm_compute/core/NEON/wrapper/intrinsics/tanh.h new file mode 100644 index 0000000000..8a6978a767 --- /dev/null +++ b/arm_compute/core/NEON/wrapper/intrinsics/tanh.h @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2018-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef __ARM_COMPUTE_WRAPPER_TANH_H__ +#define __ARM_COMPUTE_WRAPPER_TANH_H__ + +#include "arm_compute/core/NEON/NEMath.h" +#include <arm_neon.h> + +namespace arm_compute +{ +namespace wrapper +{ +#define VTANH_IMPL(vtype, prefix, postfix) \ + inline vtype vtanh(const vtype &a) \ + { \ + return prefix##_##postfix(a); \ + } + +VTANH_IMPL(float32x4_t, vtanhq, f32) +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +VTANH_IMPL(float16x8_t, vtanhq, f16) +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +#undef VTANH_IMPL +} // namespace wrapper +} // namespace arm_compute +#endif /* __ARM_COMPUTE_WRAPPER_TANH_H__ */ diff --git a/arm_compute/core/NEON/wrapper/scalar/add.h b/arm_compute/core/NEON/wrapper/scalar/add.h new file mode 100644 index 0000000000..cfb9040281 --- /dev/null +++ b/arm_compute/core/NEON/wrapper/scalar/add.h @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2018-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_WRAPPER_SCALAR_ADD_H__ +#define __ARM_COMPUTE_WRAPPER_SCALAR_ADD_H__ + +#include <arm_neon.h> + +namespace arm_compute +{ +namespace wrapper +{ +inline uint8_t add_sat(const uint8_t &a, const uint8_t &b) +{ + const uint8x8_t va = { a, 0, 0, 0, 0, 0, 0, 0 }; + const uint8x8_t vb = { b, 0, 0, 0, 0, 0, 0, 0 }; + return vget_lane_u8(vqadd_u8(va, vb), 0); +} + +inline int16_t add_sat(const int16_t &a, const int16_t &b) +{ + const int16x4_t va = { a, 0, 0, 0 }; + const int16x4_t vb = { b, 0, 0, 0 }; + return vget_lane_s16(vqadd_s16(va, vb), 0); +} + +inline float add_sat(const float &a, const float &b) +{ + // No notion of saturation exists in floating point + return a + b; +} + +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +inline float16_t add_sat(const float16_t &a, const float16_t &b) +{ + // No notion of saturation exists in floating point + return a + b; +} +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +} // namespace wrapper +} // namespace arm_compute +#endif /* __ARM_COMPUTE_WRAPPER_SCALAR_ADD_H__ */ diff --git a/arm_compute/core/NEON/wrapper/scalar/scalar.h b/arm_compute/core/NEON/wrapper/scalar/scalar.h new file mode 100644 index 0000000000..a52e0ceb28 --- /dev/null +++ b/arm_compute/core/NEON/wrapper/scalar/scalar.h @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2018-2019 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_WRAPPER_SCALAR_H__ +#define __ARM_COMPUTE_WRAPPER_SCALAR_H__ + +#include "arm_compute/core/NEON/wrapper/scalar/add.h" + +#endif /* __ARM_COMPUTE_WRAPPER_SCALAR_H__ */ diff --git a/arm_compute/core/NEON/wrapper/wrapper.h b/arm_compute/core/NEON/wrapper/wrapper.h index 61dc42a69b..60dba5c022 100644 --- a/arm_compute/core/NEON/wrapper/wrapper.h +++ b/arm_compute/core/NEON/wrapper/wrapper.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018 ARM Limited. + * Copyright (c) 2018-2019 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -29,5 +29,6 @@ // Intrinsics Overloads #include "arm_compute/core/NEON/wrapper/intrinsics/intrinsics.h" +#include "arm_compute/core/NEON/wrapper/scalar/scalar.h" #endif /* __ARM_COMPUTE_WRAPPER_H__ */ diff --git a/arm_compute/core/utils/misc/Traits.h b/arm_compute/core/utils/misc/Traits.h index 9d86dd1b3c..9f6e49a452 100644 --- a/arm_compute/core/utils/misc/Traits.h +++ b/arm_compute/core/utils/misc/Traits.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018 ARM Limited. + * Copyright (c) 2018-2019 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -41,6 +41,13 @@ template <> struct is_floating_point<half> : public std::true_type { }; + +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +template <> +struct is_floating_point<__fp16> : public std::true_type +{ +}; +#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC*/ } // namespace traits } // namespace utils } // namespace arm_compute diff --git a/src/core/NEON/kernels/NEActivationLayerKernel.cpp b/src/core/NEON/kernels/NEActivationLayerKernel.cpp index 5ce79f1007..97cb9ceb2e 100644 --- a/src/core/NEON/kernels/NEActivationLayerKernel.cpp +++ b/src/core/NEON/kernels/NEActivationLayerKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2018 ARM Limited. + * Copyright (c) 2017-2019 ARM Limited. 
* * SPDX-License-Identifier: MIT * @@ -29,6 +29,7 @@ #include "arm_compute/core/NEON/NEAsymm.h" #include "arm_compute/core/NEON/NEFixedPoint.h" #include "arm_compute/core/NEON/NEMath.h" +#include "arm_compute/core/NEON/wrapper/wrapper.h" #include "arm_compute/core/QAsymm8.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" @@ -60,29 +61,21 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output) std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output) { - constexpr unsigned int num_elems_processed_per_iteration = 16; - Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration)); - bool window_changed = false; + // Configure kernel window + Window win = calculate_max_window(*input, Steps()); - if(output != nullptr && (output->total_size() != 0)) + if(output != nullptr) { - AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration); - - window_changed = update_window_and_padding(win, - AccessWindowHorizontal(input, 0, num_elems_processed_per_iteration), - output_access); + // Output auto initialization if not yet initialized + auto_init_if_empty(*output, *input->clone()); - output_access.set_valid_region(win, input->valid_region()); - } - else - { - // In-place computation - window_changed = update_window_and_padding(win, - AccessWindowHorizontal(input, 0, num_elems_processed_per_iteration)); + // NEActivationLayerKernel doesn't need padding so update_window_and_padding() can be skipped + Coordinates coord; + coord.set_num_dimensions(output->num_dimensions()); + output->set_valid_region(ValidRegion(coord, output->tensor_shape())); } - Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; - return std::make_pair(err, win); + return std::make_pair(Status{}, win); } } // namespace @@ -101,15 +94,13 @@ void NEActivationLayerKernel::configure(ITensor *input, ITensor *output, Activat if(output != nullptr) { - // Output auto inizialitation if not yet initialized - auto_init_if_empty(*output->info(), *input->info()->clone()); _output = output; } ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (output != nullptr) ?
output->info() : nullptr)); - ARM_COMPUTE_ERROR_ON_MSG((input->info()->data_type() == DataType::QASYMM8) && (activation_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) - && (activation_info.activation() != ActivationLayerInfo::ActivationFunction::RELU), + ARM_COMPUTE_ERROR_ON_MSG((input->info()->data_type() == DataType::QASYMM8) && (activation_info.activation() != ActivationLayerInfo::ActivationFunction::RELU) + && (activation_info.activation() != ActivationLayerInfo::ActivationFunction::BOUNDED_RELU) && (activation_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU), "For QASYMM8 only relu and lower/upper bounded relu are supported"); // Activation functions : FP32 @@ -176,337 +167,129 @@ void NEActivationLayerKernel::configure(ITensor *input, ITensor *output, Activat ICPPKernel::configure(win_config.second); } -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC template <ActivationLayerInfo::ActivationFunction F, typename T> -typename std::enable_if<std::is_same<T, float16_t>::value, void>::type NEActivationLayerKernel::activation(const Window &window) +typename std::enable_if<arm_compute::utils::traits::is_floating_point<T>::value, void>::type +NEActivationLayerKernel::activation(const Window &window) { - Iterator input(_input, window); - Iterator output(_output, window); + /** NEON vector tag type. */ + using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>; + + const int window_step_x = 16 / sizeof(T); + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + const ActivationFunction act = F; - static const float16x8_t CONST_0 = vdupq_n_f16(0.f); - static const float16x8_t CONST_1_H = vdupq_n_f16(1.f); + Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); + win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); - static const float32x4_t CONST_1_F32 = vdupq_n_f32(1.f); + Iterator input(_input, win_collapsed); + Iterator output(_output, win_collapsed); - const float16x8_t a = vdupq_n_f16(_act_info.a()); - const float16x4_t a_h = vdup_n_f16(_act_info.a()); - const float16x8_t b = vdupq_n_f16(_act_info.b()); + const auto const_1 = wrapper::vdup_n(static_cast<T>(1.f), ExactTagType{}); + const auto const_0 = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{}); + const auto va = wrapper::vdup_n(static_cast<T>(_act_info.a()), ExactTagType{}); + const auto vb = wrapper::vdup_n(static_cast<T>(_act_info.b()), ExactTagType{}); + const auto a = static_cast<T>(_act_info.a()); + const auto b = static_cast<T>(_act_info.b()); - execute_window_loop(window, [&](const Coordinates &) + execute_window_loop(win_collapsed, [&](const Coordinates & id) { - const auto input_ptr = reinterpret_cast<const float16_t *>(input.ptr()); - const auto output_ptr = reinterpret_cast<float16_t *>(output.ptr()); + const auto input_ptr = reinterpret_cast<const T *>(input.ptr()); + const auto output_ptr = reinterpret_cast<T *>(output.ptr()); - const float16x8x2_t in = vld2q_f16(input_ptr); - float16x8x2_t tmp = { {} }; + wrapper::traits::neon_bitvector_t<T, wrapper::traits::BitWidth::W128> tmp; - switch(F) + // Compute S elements per iteration + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) { - case ActivationFunction::ABS: - tmp = - { - { - vabsq_f16(in.val[0]), - vabsq_f16(in.val[1]), - } - }; - break; - case ActivationFunction::BOUNDED_RELU: - tmp = - { - { - vminq_f16(a, 
vmaxq_f16(CONST_0, in.val[0])), - vminq_f16(a, vmaxq_f16(CONST_0, in.val[1])) - } - }; - break; - case ActivationFunction::LU_BOUNDED_RELU: - tmp = - { - { - vminq_f16(a, vmaxq_f16(b, in.val[0])), - vminq_f16(a, vmaxq_f16(b, in.val[1])) - } - }; - break; - case ActivationFunction::LINEAR: - tmp = - { - { - vaddq_f16(b, vmulq_f16(a, in.val[0])), - vaddq_f16(b, vmulq_f16(a, in.val[1])) - } - }; - break; - case ActivationFunction::LOGISTIC: - { - tmp = - { - { - vinvq_f16(vaddq_f16(CONST_1_H, vexpq_f16(vnegq_f16(in.val[0])))), - vinvq_f16(vaddq_f16(CONST_1_H, vexpq_f16(vnegq_f16(in.val[1])))) - } - }; - } - break; - case ActivationFunction::RELU: - tmp = - { - { - vmaxq_f16(CONST_0, in.val[0]), - vmaxq_f16(CONST_0, in.val[1]) - } - }; - break; - case ActivationFunction::LEAKY_RELU: - tmp = - { - { - vbslq_f16(vcgtq_f16(in.val[0], CONST_0), in.val[0], vmulq_f16(a, in.val[0])), - vbslq_f16(vcgtq_f16(in.val[1], CONST_0), in.val[1], vmulq_f16(a, in.val[1])) - } - }; - break; - case ActivationFunction::SOFT_RELU: - { - // TODO (COMPMID-1535) : Revisit FP16 approximations - const float16x4x2_t in0 = - { - vcvt_f16_f32(vlogq_f32(vaddq_f32(CONST_1_F32, vexpq_f32(vcvt_f32_f16(vget_low_f16(in.val[0])))))), - vcvt_f16_f32(vlogq_f32(vaddq_f32(CONST_1_F32, vexpq_f32(vcvt_f32_f16(vget_high_f16(in.val[0])))))), - }; - - const float16x4x2_t in1 = - { - vcvt_f16_f32(vlogq_f32(vaddq_f32(CONST_1_F32, vexpq_f32(vcvt_f32_f16(vget_low_f16(in.val[1])))))), - vcvt_f16_f32(vlogq_f32(vaddq_f32(CONST_1_F32, vexpq_f32(vcvt_f32_f16(vget_high_f16(in.val[1])))))), - }; - - tmp = - { - { - vcombine_f16(in0.val[0], in0.val[1]), - vcombine_f16(in1.val[0], in1.val[1]), - } - }; - } - break; - case ActivationFunction::SQRT: - tmp = - { - { - vinvq_f16(vinvsqrtq_f16(in.val[0])), - vinvq_f16(vinvsqrtq_f16(in.val[1])), - } - }; - break; - case ActivationFunction::SQUARE: - tmp = - { - { - vmulq_f16(in.val[0], in.val[0]), - vmulq_f16(in.val[1], in.val[1]) - } - }; - break; - case ActivationFunction::TANH: + const auto vin = wrapper::vloadq(input_ptr + x); + switch(act) { - // TODO (COMPMID-1535) : Revisit FP16 approximations - const float16x8x2_t mul = - { - vmulq_f16(b, in.val[0]), - vmulq_f16(b, in.val[1]) - }; - const float16x4x2_t in0 = - { - vmul_f16(a_h, vcvt_f16_f32(vtanhq_f32(vcvt_f32_f16(vget_low_f16(mul.val[0]))))), - vmul_f16(a_h, vcvt_f16_f32(vtanhq_f32(vcvt_f32_f16(vget_high_f16(mul.val[0]))))), - }; - - const float16x4x2_t in1 = - { - vmul_f16(a_h, vcvt_f16_f32(vtanhq_f32(vcvt_f32_f16(vget_low_f16(mul.val[1]))))), - vmul_f16(a_h, vcvt_f16_f32(vtanhq_f32(vcvt_f32_f16(vget_high_f16(mul.val[1]))))), - }; - - tmp = - { - { - vcombine_f16(in0.val[0], in0.val[1]), - vcombine_f16(in1.val[0], in1.val[1]), - } - }; + case ActivationFunction::ABS: + tmp = wrapper::vabs(vin); + break; + case ActivationFunction::LINEAR: + tmp = wrapper::vmla(vb, va, vin); + break; + case ActivationFunction::LOGISTIC: + tmp = wrapper::vinv(wrapper::vadd(const_1, wrapper::vexpq(wrapper::vneg(vin)))); + break; + case ActivationFunction::RELU: + tmp = wrapper::vmax(const_0, vin); + break; + case ActivationFunction::BOUNDED_RELU: + tmp = wrapper::vmin(va, wrapper::vmax(const_0, vin)); + break; + case ActivationFunction::LU_BOUNDED_RELU: + tmp = wrapper::vmin(va, wrapper::vmax(vb, vin)); + break; + case ActivationFunction::LEAKY_RELU: + tmp = wrapper::vbsl(wrapper::vcgt(vin, const_0), vin, wrapper::vmul(va, vin)); + break; + case ActivationFunction::SOFT_RELU: + tmp = wrapper::vlog(wrapper::vadd(const_1, wrapper::vexpq(vin))); + break; + case 
ActivationFunction::SQRT: + tmp = wrapper::vinv(wrapper::vinvsqrt(vin)); + break; + case ActivationFunction::SQUARE: + tmp = wrapper::vmul(vin, vin); + break; + case ActivationFunction::TANH: + tmp = wrapper::vmul(va, wrapper::vtanh(wrapper::vmul(vb, vin))); + break; + default: + ARM_COMPUTE_ERROR("Unsupported activation function"); } - break; - default: - ARM_COMPUTE_ERROR("Not implemented"); - break; + wrapper::vstore(output_ptr + x, tmp); } - vst2q_f16(output_ptr, tmp); - }, - input, output); -} -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - -template <ActivationLayerInfo::ActivationFunction F, typename T> -typename std::enable_if<std::is_same<T, float>::value, void>::type NEActivationLayerKernel::activation(const Window &window) -{ - Iterator input(_input, window); - Iterator output(_output, window); - - static const float32x4_t CONST_1 = vdupq_n_f32(1.f); - static const float32x4_t CONST_0 = vdupq_n_f32(0.f); - const float32x4_t a = vdupq_n_f32(_act_info.a()); - const float32x4_t b = vdupq_n_f32(_act_info.b()); - - execute_window_loop(window, [&](const Coordinates & id) - { - const auto input_ptr = reinterpret_cast<const float *>(input.ptr()); - const auto output_ptr = reinterpret_cast<float *>(output.ptr()); - - const float32x4x4_t in = + // Compute left-over elements + for(; x < window_end_x; ++x) { + const T in = *(reinterpret_cast<const T *>(input_ptr + x)); + T tmp; + switch(act) { - vld1q_f32(input_ptr), - vld1q_f32(input_ptr + 4), - vld1q_f32(input_ptr + 8), - vld1q_f32(input_ptr + 12) + case ActivationFunction::ABS: + tmp = std::abs(in); + break; + case ActivationFunction::LINEAR: + tmp = a * in + b; + break; + case ActivationFunction::LOGISTIC: + tmp = static_cast<T>(1) / (static_cast<T>(1) + std::exp(-in)); + break; + case ActivationFunction::RELU: + tmp = std::max<T>(static_cast<T>(0), in); + break; + case ActivationFunction::BOUNDED_RELU: + tmp = std::min<T>(a, std::max(static_cast<T>(0), in)); + break; + case ActivationFunction::LU_BOUNDED_RELU: + tmp = std::min<T>(a, std::max<T>(b, in)); + break; + case ActivationFunction::LEAKY_RELU: + tmp = (in > 0) ? 
in : a * in; + break; + case ActivationFunction::SOFT_RELU: + tmp = std::log(static_cast<T>(1) + std::exp(in)); + break; + case ActivationFunction::SQRT: + tmp = std::sqrt(in); + break; + case ActivationFunction::SQUARE: + tmp = in * in; + break; + case ActivationFunction::TANH: + tmp = a * std::tanh(b * in); + break; + default: + ARM_COMPUTE_ERROR("Unsupported activation function"); } - }; - float32x4x4_t tmp = { {} }; - - switch(F) - { - case ActivationFunction::ABS: - tmp = - { - { - vabsq_f32(in.val[0]), - vabsq_f32(in.val[1]), - vabsq_f32(in.val[2]), - vabsq_f32(in.val[3]), - } - }; - break; - case ActivationFunction::LINEAR: - tmp = - { - { - vmlaq_f32(b, a, in.val[0]), - vmlaq_f32(b, a, in.val[1]), - vmlaq_f32(b, a, in.val[2]), - vmlaq_f32(b, a, in.val[3]), - } - }; - break; - case ActivationFunction::LOGISTIC: - tmp = - { - { - vinvq_f32(vaddq_f32(CONST_1, vexpq_f32(vnegq_f32(in.val[0])))), - vinvq_f32(vaddq_f32(CONST_1, vexpq_f32(vnegq_f32(in.val[1])))), - vinvq_f32(vaddq_f32(CONST_1, vexpq_f32(vnegq_f32(in.val[2])))), - vinvq_f32(vaddq_f32(CONST_1, vexpq_f32(vnegq_f32(in.val[3])))), - } - }; - break; - case ActivationFunction::RELU: - tmp = - { - { - vmaxq_f32(CONST_0, in.val[0]), - vmaxq_f32(CONST_0, in.val[1]), - vmaxq_f32(CONST_0, in.val[2]), - vmaxq_f32(CONST_0, in.val[3]), - } - }; - break; - case ActivationFunction::BOUNDED_RELU: - tmp = - { - { - vminq_f32(a, vmaxq_f32(CONST_0, in.val[0])), - vminq_f32(a, vmaxq_f32(CONST_0, in.val[1])), - vminq_f32(a, vmaxq_f32(CONST_0, in.val[2])), - vminq_f32(a, vmaxq_f32(CONST_0, in.val[3])), - } - }; - break; - case ActivationFunction::LU_BOUNDED_RELU: - tmp = - { - { - vminq_f32(a, vmaxq_f32(b, in.val[0])), - vminq_f32(a, vmaxq_f32(b, in.val[1])), - vminq_f32(a, vmaxq_f32(b, in.val[2])), - vminq_f32(a, vmaxq_f32(b, in.val[3])), - } - }; - break; - case ActivationFunction::LEAKY_RELU: - tmp = - { - { - vbslq_f32(vcgtq_f32(in.val[0], CONST_0), in.val[0], vmulq_f32(a, in.val[0])), - vbslq_f32(vcgtq_f32(in.val[1], CONST_0), in.val[1], vmulq_f32(a, in.val[1])), - vbslq_f32(vcgtq_f32(in.val[2], CONST_0), in.val[2], vmulq_f32(a, in.val[2])), - vbslq_f32(vcgtq_f32(in.val[3], CONST_0), in.val[3], vmulq_f32(a, in.val[3])), - } - }; - break; - case ActivationFunction::SOFT_RELU: - tmp = - { - { - vlogq_f32(vaddq_f32(CONST_1, vexpq_f32(in.val[0]))), - vlogq_f32(vaddq_f32(CONST_1, vexpq_f32(in.val[1]))), - vlogq_f32(vaddq_f32(CONST_1, vexpq_f32(in.val[2]))), - vlogq_f32(vaddq_f32(CONST_1, vexpq_f32(in.val[3]))), - } - }; - break; - case ActivationFunction::SQRT: - tmp = - { - { - vinvq_f32(vinvsqrtq_f32(in.val[0])), - vinvq_f32(vinvsqrtq_f32(in.val[1])), - vinvq_f32(vinvsqrtq_f32(in.val[2])), - vinvq_f32(vinvsqrtq_f32(in.val[3])), - } - }; - break; - case ActivationFunction::SQUARE: - tmp = - { - { - vmulq_f32(in.val[0], in.val[0]), - vmulq_f32(in.val[1], in.val[1]), - vmulq_f32(in.val[2], in.val[2]), - vmulq_f32(in.val[3], in.val[3]), - } - }; - break; - case ActivationFunction::TANH: - tmp = - { - { - vmulq_f32(a, vtanhq_f32(vmulq_f32(b, in.val[0]))), - vmulq_f32(a, vtanhq_f32(vmulq_f32(b, in.val[1]))), - vmulq_f32(a, vtanhq_f32(vmulq_f32(b, in.val[2]))), - vmulq_f32(a, vtanhq_f32(vmulq_f32(b, in.val[3]))), - } - }; - break; - default: - break; + *(output_ptr + x) = tmp; } - - vst1q_f32(output_ptr, tmp.val[0]); - vst1q_f32(output_ptr + 4, tmp.val[1]); - vst1q_f32(output_ptr + 8, tmp.val[2]); - vst1q_f32(output_ptr + 12, tmp.val[3]); }, input, output); } @@ -514,13 +297,25 @@ typename std::enable_if<std::is_same<T, float>::value, void>::type 
NEActivationL template <ActivationLayerInfo::ActivationFunction F, typename T> typename std::enable_if<std::is_same<T, qasymm8_t>::value, void>::type NEActivationLayerKernel::activation(const Window &window) { - Iterator input(_input, window); - Iterator output(_output, window); - const QuantizationInfo qi_in = _input->info()->quantization_info(); - const QuantizationInfo qi_out = _output->info()->quantization_info(); - const qasymm8x16_t a = vdupq_n_u8(sqcvt_qasymm8_f32(_act_info.a(), qi_in.scale, qi_in.offset)); - const qasymm8x16_t b = vdupq_n_u8(sqcvt_qasymm8_f32(_act_info.b(), qi_in.scale, qi_in.offset)); - const qasymm8x16_t CONST_0 = vdupq_n_u8(sqcvt_qasymm8_f32(0.f, qi_in.scale, qi_in.offset)); + const int window_step_x = 16 / sizeof(T); + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + const ActivationFunction act = F; + + Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); + win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input(_input, win_collapsed); + Iterator output(_output, win_collapsed); + + const QuantizationInfo qi_in = _input->info()->quantization_info(); + const QuantizationInfo qi_out = _output->info()->quantization_info(); + const qasymm8x16_t va = vdupq_n_u8(sqcvt_qasymm8_f32(_act_info.a(), qi_in.scale, qi_in.offset)); + const qasymm8x16_t vb = vdupq_n_u8(sqcvt_qasymm8_f32(_act_info.b(), qi_in.scale, qi_in.offset)); + const qasymm8_t a = sqcvt_qasymm8_f32(_act_info.a(), qi_in.scale, qi_in.offset); + const qasymm8_t b = sqcvt_qasymm8_f32(_act_info.b(), qi_in.scale, qi_in.offset); + const qasymm8_t const_0 = sqcvt_qasymm8_f32(0.f, qi_in.scale, qi_in.offset); + const qasymm8x16_t vconst_0 = vdupq_n_u8(const_0); // Initialise scale/offset for re-quantization float s = qi_in.scale / qi_out.scale; @@ -528,34 +323,72 @@ typename std::enable_if<std::is_same<T, qasymm8_t>::value, void>::type NEActivat float32x4_t vs = vdupq_n_f32(s); float32x4_t vo = vdupq_n_f32(o); - execute_window_loop(window, [&](const Coordinates & id) + execute_window_loop(win_collapsed, [&](const Coordinates & id) { - const auto input_ptr = reinterpret_cast<const qasymm8_t *>(input.ptr()); - const auto output_ptr = reinterpret_cast<qasymm8_t *>(output.ptr()); + const auto input_ptr = reinterpret_cast<const T *>(input.ptr()); + const auto output_ptr = reinterpret_cast<T *>(output.ptr()); - const qasymm8x16_t in = vld1q_u8(input_ptr); - qasymm8x16_t tmp = {}; + wrapper::traits::neon_bitvector_t<T, wrapper::traits::BitWidth::W128> tmp; - switch(F) + // Compute S elements per iteration + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) { - case ActivationFunction::LU_BOUNDED_RELU: + const auto vin = wrapper::vloadq(input_ptr + x); + if(act == ActivationFunction::RELU) + { // Perform activation - tmp = vminq_u8(a, vmaxq_u8(b, in)); + tmp = vmaxq_u8(vconst_0, vin); // Re-quantize to new output space tmp = vmlaq_qasymm8(tmp, vs, vo); - break; - case ActivationFunction::RELU: + } + else if(act == ActivationFunction::BOUNDED_RELU) + { // Perform activation - tmp = vmaxq_u8(CONST_0, in); + tmp = vminq_u8(va, vmaxq_u8(vconst_0, vin)); // Re-quantize to new output space tmp = vmlaq_qasymm8(tmp, vs, vo); - break; - default: - ARM_COMPUTE_ERROR("Function not implemented"); - break; + } + else if(act == ActivationFunction::LU_BOUNDED_RELU) + { + // Perform activation + tmp = vminq_u8(va, vmaxq_u8(vb, vin)); + // Re-quantize to new output space + tmp 
= vmlaq_qasymm8(tmp, vs, vo); + } + else + { + ARM_COMPUTE_ERROR("Unsupported activation function"); + } + wrapper::vstore(output_ptr + x, tmp); } - vst1q_u8(output_ptr, tmp); + // Compute left-over elements + for(; x < window_end_x; ++x) + { + T in = *(reinterpret_cast<const T *>(input_ptr + x)); + T tmp; + if(act == ActivationFunction::RELU) + { + tmp = std::max(const_0, in); + tmp = std::max(0, std::min(static_cast<int32_t>(tmp * s + o), 255)); + } + else if(act == ActivationFunction::BOUNDED_RELU) + { + tmp = std::min(a, std::max(const_0, in)); + tmp = std::max(0, std::min(static_cast<int32_t>(tmp * s + o), 255)); + } + else if(act == ActivationFunction::LU_BOUNDED_RELU) + { + tmp = std::min(a, std::max(b, in)); + tmp = std::max(0, std::min(static_cast<int32_t>(tmp * s + o), 255)); + } + else + { + ARM_COMPUTE_ERROR("Unsupported activation function"); + } + *(output_ptr + x) = tmp; + } }, input, output); } diff --git a/src/core/NEON/kernels/NEArithmeticAdditionKernel.cpp b/src/core/NEON/kernels/NEArithmeticAdditionKernel.cpp index 954a2c1754..e74833cd41 100644 --- a/src/core/NEON/kernels/NEArithmeticAdditionKernel.cpp +++ b/src/core/NEON/kernels/NEArithmeticAdditionKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2018 ARM Limited. + * Copyright (c) 2016-2019 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -29,6 +29,7 @@ #include "arm_compute/core/IAccessWindow.h" #include "arm_compute/core/ITensor.h" #include "arm_compute/core/NEON/NEFixedPoint.h" +#include "arm_compute/core/NEON/wrapper/wrapper.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Validate.h" @@ -47,337 +48,413 @@ class Coordinates; namespace { -constexpr unsigned int num_elems_processed_per_iteration = 16; - -void add_wrap_U8_U8_U8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +template <typename T, bool is_sat> +void add_same(const ITensor *in1, const ITensor *in2, ITensor *out, ConvertPolicy policy, const Window &window) { - Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape())); - Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape())); - Iterator output(out, window); - - execute_window_loop(window, [&](const Coordinates & id) - { - vst1q_u8(output.ptr(), vaddq_u8(vld1q_u8(input1.ptr()), vld1q_u8(input2.ptr()))); - }, - input1, input2, output); -} + ARM_COMPUTE_UNUSED(policy); -void add_saturate_U8_U8_U8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) -{ - Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape())); - Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape())); - Iterator output(out, window); + /** NEON vector tag type. 
*/ + using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>; - execute_window_loop(window, [&](const Coordinates & id) - { - vst1q_u8(output.ptr(), vqaddq_u8(vld1q_u8(input1.ptr()), vld1q_u8(input2.ptr()))); - }, - input1, input2, output); -} + // Create input windows + Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()); + Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()); -inline int16x8x2_t vadd2q_s16(const int16x8x2_t &a, const int16x8x2_t &b) -{ - const int16x8x2_t res = - { - { - vaddq_s16(a.val[0], b.val[0]), - vaddq_s16(a.val[1], b.val[1]) - } - }; + // Clear X Dimension on execution window as we handle manually + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); - return res; -} + constexpr int window_step_x = 16 / sizeof(T); + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + const bool is_broadcast_across_x = (input1_win.x().step() == 0) || (input2_win.x().step() == 0); -inline float32x4x4_t vadd4q_f32(const float32x4x4_t &a, const float32x4x4_t &b) -{ - const float32x4x4_t res = + if(is_broadcast_across_x) { - { - vaddq_f32(a.val[0], b.val[0]), - vaddq_f32(a.val[1], b.val[1]), - vaddq_f32(a.val[2], b.val[2]), - vaddq_f32(a.val[3], b.val[3]) - } - }; + const bool is_broadcast_input_2 = input2_win.x().step() == 0; + Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; + Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; + const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1; + const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1; - return res; -} + // Clear X Dimension on execution window as we handle manually + non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); -inline int16x8x2_t vqadd2q_s16(const int16x8x2_t &a, const int16x8x2_t &b) -{ - const int16x8x2_t res = - { + Iterator broadcast_input(broadcast_tensor, broadcast_win); + Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); + Iterator output(out, win); + + execute_window_loop(win, [&](const Coordinates & id) { - vqaddq_s16(a.val[0], b.val[0]), - vqaddq_s16(a.val[1], b.val[1]) - } - }; + const auto non_broadcast_input_ptr = reinterpret_cast<const T *>(non_broadcast_input.ptr()); + const auto output_ptr = reinterpret_cast<T *>(output.ptr()); - return res; -} + const T broadcast_value = *reinterpret_cast<const T *>(broadcast_input.ptr()); + const auto broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{}); -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -inline float16x8x2_t vadd2q_f16(const float16x8x2_t &a, const float16x8x2_t &b) -{ - const float16x8x2_t res = + // Compute S elements per iteration + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x); + const auto res = is_sat ? wrapper::vqadd(broadcast_value_vec, non_broadcast_v) : wrapper::vadd(broadcast_value_vec, non_broadcast_v); + wrapper::vstore(output_ptr + x, res); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + const auto non_broadcast_v = *(non_broadcast_input_ptr + x); + *(output_ptr + x) = is_sat ? 
wrapper::add_sat(broadcast_value, non_broadcast_v) : broadcast_value + non_broadcast_v; + } + }, + broadcast_input, non_broadcast_input, output); + } + else { - { - vaddq_f16(a.val[0], b.val[0]), - vaddq_f16(a.val[1], b.val[1]) - } - }; + // Clear X Dimension on execution window as we handle manually + input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - return res; -} -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ + Iterator input1(in1, input1_win); + Iterator input2(in2, input2_win); + Iterator output(out, win); -void add_F16_F16_F16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) -{ -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape())); - Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape())); - Iterator output(out, window); + execute_window_loop(win, [&](const Coordinates & id) + { + const auto input1_ptr = reinterpret_cast<const T *>(input1.ptr()); + const auto input2_ptr = reinterpret_cast<const T *>(input2.ptr()); + const auto output_ptr = reinterpret_cast<T *>(output.ptr()); - execute_window_loop(window, [&](const Coordinates & id) - { - const float16x8x2_t a = vld2q_f16(reinterpret_cast<const float16_t *>(input1.ptr())); - const float16x8x2_t b = vld2q_f16(reinterpret_cast<const float16_t *>(input2.ptr())); + // Compute S elements per iteration + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto val1 = wrapper::vloadq(input1_ptr + x); + const auto val2 = wrapper::vloadq(input2_ptr + x); + const auto res = is_sat ? wrapper::vqadd(val1, val2) : wrapper::vadd(val1, val2); + wrapper::vstore(output_ptr + x, res); + } - vst2q_f16(reinterpret_cast<float16_t *>(output.ptr()), vadd2q_f16(a, b)); - }, - input1, input2, output); -#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - ARM_COMPUTE_UNUSED(in1); - ARM_COMPUTE_UNUSED(in2); - ARM_COMPUTE_UNUSED(out); - ARM_COMPUTE_UNUSED(window); - ARM_COMPUTE_ERROR("Not supported, recompile the library with arch=arm64-v8.2-a"); -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ + // Compute left-over elements + for(; x < window_end_x; ++x) + { + const auto val1 = *(input1_ptr + x); + const auto val2 = *(input2_ptr + x); + *(output_ptr + x) = is_sat ? 
wrapper::add_sat(val1, val2) : val1 + val2; + } + }, + input1, input2, output); + } } -void add_F32_F32_F32(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +void add_QASYMM8_QASYMM8_QASYMM8(const ITensor *in1, const ITensor *in2, ITensor *out, ConvertPolicy policy, const Window &window) { - Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape())); - Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape())); - Iterator output(out, window); + ARM_COMPUTE_UNUSED(policy); - execute_window_loop(window, [&](const Coordinates & id) - { - const float32x4x4_t a = vld4q_f32(reinterpret_cast<const float *>(input1.ptr())); - const float32x4x4_t b = vld4q_f32(reinterpret_cast<const float *>(input2.ptr())); + // Create input windows + Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()); + Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()); - vst4q_f32(reinterpret_cast<float *>(output.ptr()), vadd4q_f32(a, b)); - }, - input1, input2, output); -} + // Clear X Dimension on execution window as we handle manually + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); -void add_QASYMM8_QASYMM8_QASYMM8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) -{ - Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape())); - Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape())); - Iterator output(out, window); + const int window_step_x = 16; + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + const bool is_broadcast_across_x = (input1_win.x().step() == 0) || (input2_win.x().step() == 0); + + const float output_scale = out->info()->quantization_info().scale; + const float invoutput_scale = 1.f / output_scale; + const int output_offset = out->info()->quantization_info().offset; const float32x4_t vscale1 = vdupq_n_f32(in1->info()->quantization_info().scale); const float32x4_t vscale2 = vdupq_n_f32(in2->info()->quantization_info().scale); - const float32x4_t invvscaleo = vdupq_n_f32(1.f / out->info()->quantization_info().scale); + const float32x4_t invvscaleo = vdupq_n_f32(1.f / output_scale); const int32x4_t voffset1 = vdupq_n_s32(in1->info()->quantization_info().offset); const int32x4_t voffset2 = vdupq_n_s32(in2->info()->quantization_info().offset); - const float32x4_t voffseto = vdupq_n_f32(out->info()->quantization_info().offset); + const float32x4_t voffseto = vdupq_n_f32(output_offset); - execute_window_loop(window, [&](const Coordinates & id) + if(is_broadcast_across_x) { - const uint8x16_t a = vld1q_u8(input1.ptr()); - const uint8x16_t b = vld1q_u8(input2.ptr()); - - const float32x4x4_t af = + const bool is_broadcast_input_2 = input2_win.x().step() == 0; + Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; + Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; + const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1; + const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? 
in2 : in1; + const QuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info(); + const QuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info(); + + // Clear X Dimension on execution window as we handle manually + non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator broadcast_input(broadcast_tensor, broadcast_win); + Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); + Iterator output(out, win); + + execute_window_loop(win, [&](const Coordinates & id) { + const auto non_broadcast_input_ptr = reinterpret_cast<const uint8_t *>(non_broadcast_input.ptr()); + const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr()); + + const uint8_t broadcast_value = *reinterpret_cast<const uint8_t *>(broadcast_input.ptr()); + const uint8x16_t broadcast_value_vec = vdupq_n_u8(broadcast_value); + + const float32x4x4_t bf = + { + { + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(broadcast_value_vec))))), voffset2)), vscale2), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(broadcast_value_vec))))), voffset2)), vscale2), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(broadcast_value_vec))))), voffset2)), vscale2), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(broadcast_value_vec))))), voffset2)), vscale2), + } + }; + const float bfs = static_cast<int32_t>(broadcast_value - broadcast_qinfo.offset) * broadcast_qinfo.scale; + + // Compute S elements per iteration + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) { - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(a))))), voffset1)), vscale1), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(a))))), voffset1)), vscale1), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(a))))), voffset1)), vscale1), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(a))))), voffset1)), vscale1), + const uint8x16_t a = vld1q_u8(non_broadcast_input_ptr + x); + const float32x4x4_t af = + { + { + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(a))))), voffset1)), vscale1), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(a))))), voffset1)), vscale1), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(a))))), voffset1)), vscale1), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(a))))), voffset1)), vscale1), + } + }; + + const int32x4x4_t rf = + { + { + vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[0], bf.val[0]), invvscaleo)), + vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[1], bf.val[1]), invvscaleo)), + vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[2], bf.val[2]), invvscaleo)), + vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[3], bf.val[3]), invvscaleo)), + } + }; + + const uint8x8_t pa = vqmovun_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1]))); + const uint8x8_t pb = vqmovun_s16(vcombine_s16(vqmovn_s32(rf.val[2]), vqmovn_s32(rf.val[3]))); + vst1q_u8(output_ptr + x, 
vcombine_u8(pa, pb)); } - }; - const float32x4x4_t bf = - { + // Compute left-over elements + for(; x < window_end_x; ++x) { - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(b))))), voffset2)), vscale2), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(b))))), voffset2)), vscale2), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(b))))), voffset2)), vscale2), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(b))))), voffset2)), vscale2), + const float afs = static_cast<int32_t>(*(non_broadcast_input_ptr + x) - non_broadcast_qinfo.offset) * non_broadcast_qinfo.scale; + *(output_ptr + x) = std::max(0, std::min(static_cast<int32_t>((afs + bfs) * invoutput_scale + output_offset), 255)); } - }; + }, + broadcast_input, non_broadcast_input, output); + } + else + { + // Clear X Dimension on execution window as we handle manually + input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + const QuantizationInfo input1_qinfo = in1->info()->quantization_info(); + const QuantizationInfo input2_qinfo = in2->info()->quantization_info(); - const int32x4x4_t rf = + Iterator input1(in1, input1_win); + Iterator input2(in2, input2_win); + Iterator output(out, win); + + execute_window_loop(win, [&](const Coordinates & id) { + const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr()); + const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr()); + const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr()); + + // Compute S elements per iteration + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) { - vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[0], bf.val[0]), invvscaleo)), - vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[1], bf.val[1]), invvscaleo)), - vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[2], bf.val[2]), invvscaleo)), - vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[3], bf.val[3]), invvscaleo)), + const uint8x16_t a = vld1q_u8(input1_ptr + x); + const uint8x16_t b = vld1q_u8(input2_ptr + x); + + const float32x4x4_t af = + { + { + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(a))))), voffset1)), vscale1), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(a))))), voffset1)), vscale1), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(a))))), voffset1)), vscale1), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(a))))), voffset1)), vscale1), + } + }; + + const float32x4x4_t bf = + { + { + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(b))))), voffset2)), vscale2), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(b))))), voffset2)), vscale2), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(b))))), voffset2)), vscale2), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(b))))), voffset2)), vscale2), + } + }; + + const int32x4x4_t rf = + { + { + vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[0], bf.val[0]), 
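// Scalar sketch of the QASYMM8 arithmetic used in both paths above: dequantize
// each input with its own scale/offset, add in float, then requantize with the
// output's scale/offset and clamp to [0, 255]. Truncation via static_cast
// mirrors the vcvtq_s32_f32 conversion in the vector path.
#include <algorithm>
#include <cstdint>

uint8_t add_qasymm8_sketch(uint8_t a, float scale1, int offset1,
                           uint8_t b, float scale2, int offset2,
                           float output_scale, int output_offset)
{
    const float afs = static_cast<int32_t>(a - offset1) * scale1; // dequantize a
    const float bfs = static_cast<int32_t>(b - offset2) * scale2; // dequantize b
    const int32_t q = static_cast<int32_t>((afs + bfs) / output_scale + output_offset); // requantize
    return static_cast<uint8_t>(std::max(0, std::min(q, 255)));   // saturate to U8
}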
invvscaleo)), + vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[1], bf.val[1]), invvscaleo)), + vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[2], bf.val[2]), invvscaleo)), + vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[3], bf.val[3]), invvscaleo)), + } + }; + + const uint8x8_t pa = vqmovun_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1]))); + const uint8x8_t pb = vqmovun_s16(vcombine_s16(vqmovn_s32(rf.val[2]), vqmovn_s32(rf.val[3]))); + vst1q_u8(output_ptr + x, vcombine_u8(pa, pb)); } - }; - const uint8x8_t pa = vqmovun_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1]))); - const uint8x8_t pb = vqmovun_s16(vcombine_s16(vqmovn_s32(rf.val[2]), vqmovn_s32(rf.val[3]))); - vst1q_u8(output.ptr(), vcombine_u8(pa, pb)); - }, - input1, input2, output); + // Compute left-over elements + for(; x < window_end_x; ++x) + { + const float afs = static_cast<int32_t>((*(input1_ptr + x)) - input1_qinfo.offset) * input1_qinfo.scale; + const float bfs = static_cast<int32_t>((*(input2_ptr + x)) - input2_qinfo.offset) * input2_qinfo.scale; + *(output_ptr + x) = std::max(0, std::min(static_cast<int32_t>((afs + bfs) * invoutput_scale + output_offset), 255)); + } + }, + input1, input2, output); + } } -void add_wrap_S16_S16_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +void add_S16_U8_S16(const ITensor *in1, const ITensor *in2, ITensor *out, ConvertPolicy policy, const Window &window) { - Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape())); - Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape())); - Iterator output(out, window); + // Create input windows + Window win = window; + Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()); + Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()); - execute_window_loop(window, [&](const Coordinates & id) - { - const int16x8x2_t a = vld2q_s16(reinterpret_cast<const int16_t *>(input1.ptr())); - const int16x8x2_t b = vld2q_s16(reinterpret_cast<const int16_t *>(input2.ptr())); + // Clear X Dimension on execution window as we handle manually + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - vst2q_s16(reinterpret_cast<int16_t *>(output.ptr()), vadd2q_s16(a, b)); - }, - input1, input2, output); -} + Iterator input1(in1, input1_win); + Iterator input2(in2, input2_win); + Iterator output(out, win); -void add_saturate_S16_S16_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) -{ - Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape())); - Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape())); - Iterator output(out, window); + const int window_step_x = 8; + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); - execute_window_loop(window, [&](const Coordinates & id) + execute_window_loop(win, [&](const Coordinates & id) { - const int16x8x2_t a = vld2q_s16(reinterpret_cast<const int16_t *>(input1.ptr())); - const int16x8x2_t b = vld2q_s16(reinterpret_cast<const int16_t *>(input2.ptr())); - - vst2q_s16(reinterpret_cast<int16_t *>(output.ptr()), vqadd2q_s16(a, b)); - }, - input1, input2, output); -} - -void add_wrap_S16_U8_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window 
&window) -{ - Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape())); - Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape())); - Iterator output(out, window); + const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr()); + const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr()); + const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr()); - execute_window_loop(window, [&](const Coordinates & id) - { - const int16x8x2_t a = + if(policy == ConvertPolicy::WRAP) { + // Compute S elements per iteration + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) { - vld1q_s16(reinterpret_cast<const int16_t *>(input1.ptr())), - vld1q_s16(reinterpret_cast<const int16_t *>(input1.ptr()) + 8) + const auto vin1 = wrapper::vloadq(input1_ptr + x); + const auto vin2 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input2_ptr + x))); + wrapper::vstore(output_ptr + x, wrapper::vadd(vin1, vin2)); } - }; - const uint8x16_t b = vld1q_u8(input2.ptr()); - - vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()), vaddq_s16(a.val[0], vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(b))))); - vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()) + 8, vaddq_s16(a.val[1], vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(b))))); - }, - input1, input2, output); -} - -void add_saturate_S16_U8_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) -{ - Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape())); - Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape())); - Iterator output(out, window); - execute_window_loop(window, [&](const Coordinates & id) - { - const int16x8x2_t a = + // Compute left-over elements + for(; x < window_end_x; ++x) + { + *(output_ptr + x) = *(input1_ptr + x) + static_cast<int16_t>(*(input2_ptr + x)); + } + } + else { + // Compute S elements per iteration + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) { - vld1q_s16(reinterpret_cast<const int16_t *>(input1.ptr())), - vld1q_s16(reinterpret_cast<const int16_t *>(input1.ptr()) + 8) + const auto vin1 = wrapper::vloadq(input1_ptr + x); + const auto vin2 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input2_ptr + x))); + wrapper::vstore(output_ptr + x, wrapper::vqadd(vin1, vin2)); } - }; - const uint8x16_t b = vld1q_u8(input2.ptr()); - vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()), vqaddq_s16(a.val[0], vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(b))))); - vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()) + 8, vqaddq_s16(a.val[1], vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(b))))); + // Compute left-over elements + for(; x < window_end_x; ++x) + { + *(output_ptr + x) = wrapper::add_sat(*(input1_ptr + x), static_cast<int16_t>(*(input2_ptr + x))); + } + } }, input1, input2, output); } -inline void add_wrap_U8_S16_S16(const ITensor *input1, const ITensor *input2, ITensor *output, const Window &window) +inline void add_U8_S16_S16(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy, const Window &window) { - //Simply swap the two input buffers: - add_wrap_S16_U8_S16(input2, input1, output, window); + // Simply swap the two input buffers: + add_S16_U8_S16(input2, input1, output, policy, window); } -inline void add_saturate_U8_S16_S16(const ITensor *input1, const ITensor *input2, ITensor *output, const Window &window) +void add_U8_U8_S16(const ITensor *in1, 
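// Sketch of the widening S16 + U8 -> S16 addition implemented above: the U8
// input is widened to S16 (vmovl_u8 plus a reinterpret in the kernel) and then
// added with either wrapping or saturating semantics, depending on ConvertPolicy.
#include <algorithm>
#include <cstdint>
#include <limits>

int16_t add_s16_u8_sketch(int16_t a, uint8_t b, bool saturate)
{
    const int32_t sum = static_cast<int32_t>(a) + static_cast<int32_t>(b); // widen, exact in 32 bits
    if(!saturate)
    {
        return static_cast<int16_t>(sum); // WRAP: modular truncation back to S16
    }
    const int32_t lo = std::numeric_limits<int16_t>::min();
    const int32_t hi = std::numeric_limits<int16_t>::max();
    return static_cast<int16_t>(std::min(std::max(sum, lo), hi)); // SATURATE: clamp like vqadd
}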
const ITensor *in2, ITensor *out, ConvertPolicy policy, const Window &window) { - //Simply swap the two input buffers: - add_saturate_S16_U8_S16(input2, input1, output, window); -} + // Create input windows + Window win = window; + Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()); + Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()); -void add_wrap_U8_U8_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) -{ - Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape())); - Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape())); - Iterator output(out, window); + // Clear X Dimension on execution window as we handle manually + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - execute_window_loop(window, [&](const Coordinates & id) + Iterator input1(in1, input1_win); + Iterator input2(in2, input2_win); + Iterator output(out, win); + + const int window_step_x = 8; + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + + execute_window_loop(win, [&](const Coordinates & id) { - const uint8x16_t a = vld1q_u8(input1.ptr()); - const uint8x16_t b = vld1q_u8(input2.ptr()); + const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr()); + const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr()); + const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr()); - const int16x8x2_t a_s16 = + if(policy == ConvertPolicy::WRAP) { + // Compute S elements per iteration + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) { - vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(a))), - vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(a))) + const auto vin1 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input1_ptr + x))); + const auto vin2 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input2_ptr + x))); + wrapper::vstore(output_ptr + x, wrapper::vadd(vin1, vin2)); } - }; - const int16x8x2_t b_s16 = - { + // Compute left-over elements + for(; x < window_end_x; ++x) { - vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(b))), - vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(b))) + *(output_ptr + x) = static_cast<int16_t>(*(input1_ptr + x)) + static_cast<int16_t>(*(input2_ptr + x)); } - }; - - vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()), vaddq_s16(a_s16.val[0], b_s16.val[0])); - vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()) + 8, vaddq_s16(a_s16.val[1], b_s16.val[1])); - }, - input1, input2, output); -} - -void add_saturate_U8_U8_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) -{ - Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape())); - Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape())); - Iterator output(out, window); - - execute_window_loop(window, [&](const Coordinates & id) - { - const uint8x16_t a = vld1q_u8(input1.ptr()); - const uint8x16_t b = vld1q_u8(input2.ptr()); - - const int16x8x2_t a_s16 = + } + else { + // Compute S elements per iteration + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) { - vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(a))), - vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(a))) + const auto vin1 = 
vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input1_ptr + x))); + const auto vin2 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input2_ptr + x))); + wrapper::vstore(output_ptr + x, wrapper::vqadd(vin1, vin2)); } - }; - const int16x8x2_t b_s16 = - { + // Compute left-over elements + for(; x < window_end_x; ++x) { - vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(b))), - vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(b))) + *(output_ptr + x) = wrapper::add_sat(static_cast<int16_t>(*(input1_ptr + x)), + static_cast<int16_t>(*(input2_ptr + x))); } - }; - - vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()), vqaddq_s16(a_s16.val[0], b_s16.val[0])); - vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()) + 8, vqaddq_s16(a_s16.val[1], b_s16.val[1])); + } }, input1, input2, output); } @@ -393,6 +470,9 @@ Status validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2, const TensorShape out_shape = TensorShape::broadcast_shape(input1.tensor_shape(), input2.tensor_shape()); ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((input1.tensor_shape().x() != input2.tensor_shape().x()) && ((input1.data_type() != input2.data_type()) || (input1.data_type() != output.data_type()) + || (input2.data_type() != output.data_type())), + "Broadcasting across width is supported on configurations where all tensors have the same data type"); // Validate in case of configured output if(output.total_size() > 0) @@ -443,27 +523,20 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo &input1, ITe } } - Window win = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration)); - Window win_input1 = win.broadcast_if_dimension_le_one(input1); - Window win_input2 = win.broadcast_if_dimension_le_one(input2); + Window win = calculate_max_window(valid_region, Steps()); - AccessWindowHorizontal input1_access(&input1, 0, num_elems_processed_per_iteration); - AccessWindowHorizontal input2_access(&input2, 0, num_elems_processed_per_iteration); - AccessWindowHorizontal output_access(&output, 0, num_elems_processed_per_iteration); + // NEArithmeticAdditionKernel doesn't need padding so update_window_and_padding() can be skipped + Coordinates coord; + coord.set_num_dimensions(output.num_dimensions()); + output.set_valid_region(valid_region); - bool window_changed = update_window_and_padding(win_input1, input1_access) - || update_window_and_padding(win_input2, input2_access) - || update_window_and_padding(win, output_access); - - output_access.set_valid_region(win, valid_region); - - Status err = (window_changed) ? 
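// Minimal sketch of the padding-free window configuration this patch adopts,
// assuming the arm_compute helpers it already calls (calculate_max_window with
// default unit Steps, ITensorInfo::set_valid_region). Because the kernel walks
// the X dimension itself with a left-over loop, no access-window padding is
// needed and update_window_and_padding() can be skipped entirely.
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensorInfo.h"
#include "arm_compute/core/Window.h"

arm_compute::Window make_padding_free_window(arm_compute::ITensorInfo       &output,
                                             const arm_compute::ValidRegion &valid_region)
{
    // Unit steps: the window never implies reads/writes beyond the tensor
    arm_compute::Window win = arm_compute::calculate_max_window(valid_region, arm_compute::Steps());
    output.set_valid_region(valid_region);
    return win;
}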
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
-    return std::make_pair(err, win);
+    return std::make_pair(Status{}, win);
 }
 } // namespace

 NEArithmeticAdditionKernel::NEArithmeticAdditionKernel()
-    : _func(nullptr), _input1(nullptr), _input2(nullptr), _output(nullptr)
+    : _func(nullptr), _input1(nullptr), _input2(nullptr), _output(nullptr), _policy()
 {
 }

@@ -478,27 +551,30 @@ void NEArithmeticAdditionKernel::configure(const ITensor *input1, const ITensor
     static std::map<std::string, AddFunction *> map_function =
     {
-        { "add_wrap_U8_U8_U8", &add_wrap_U8_U8_U8 },
-        { "add_saturate_U8_U8_U8", &add_saturate_U8_U8_U8 },
-        { "add_wrap_S16_U8_S16", &add_wrap_S16_U8_S16 },
-        { "add_saturate_S16_U8_S16", &add_saturate_S16_U8_S16 },
-        { "add_wrap_U8_S16_S16", &add_wrap_U8_S16_S16 },
-        { "add_saturate_U8_S16_S16", &add_saturate_U8_S16_S16 },
-        { "add_wrap_U8_U8_S16", &add_wrap_U8_U8_S16 },
-        { "add_saturate_U8_U8_S16", &add_saturate_U8_U8_S16 },
-        { "add_wrap_S16_S16_S16", &add_wrap_S16_S16_S16 },
-        { "add_saturate_S16_S16_S16", &add_saturate_S16_S16_S16 },
-        { "add_wrap_F32_F32_F32", &add_F32_F32_F32 },
-        { "add_saturate_F32_F32_F32", &add_F32_F32_F32 },
-        { "add_wrap_F16_F16_F16", &add_F16_F16_F16 },
-        { "add_saturate_F16_F16_F16", &add_F16_F16_F16 },
         { "add_wrap_QASYMM8_QASYMM8_QASYMM8", &add_QASYMM8_QASYMM8_QASYMM8 },
         { "add_saturate_QASYMM8_QASYMM8_QASYMM8", &add_QASYMM8_QASYMM8_QASYMM8 },
+        { "add_wrap_U8_U8_U8", &add_same<uint8_t, false> },
+        { "add_saturate_U8_U8_U8", &add_same<uint8_t, true> },
+        { "add_wrap_S16_U8_S16", &add_S16_U8_S16 },
+        { "add_saturate_S16_U8_S16", &add_S16_U8_S16 },
+        { "add_wrap_U8_S16_S16", &add_U8_S16_S16 },
+        { "add_saturate_U8_S16_S16", &add_U8_S16_S16 },
+        { "add_wrap_U8_U8_S16", &add_U8_U8_S16 },
+        { "add_saturate_U8_U8_S16", &add_U8_U8_S16 },
+        { "add_wrap_S16_S16_S16", &add_same<int16_t, false> },
+        { "add_saturate_S16_S16_S16", &add_same<int16_t, true> },
+        { "add_wrap_F32_F32_F32", &add_same<float, false> },
+        { "add_saturate_F32_F32_F32", &add_same<float, false> },
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+        { "add_wrap_F16_F16_F16", &add_same<float16_t, false> },
+        { "add_saturate_F16_F16_F16", &add_same<float16_t, false> },
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
     };

     _input1 = input1;
     _input2 = input2;
     _output = output;
+    _policy = policy;

     std::string function_to_call("add_");
     function_to_call += policy == ConvertPolicy::WRAP ?
"wrap_" : "saturate_"; @@ -533,12 +609,5 @@ void NEArithmeticAdditionKernel::run(const Window &window, const ThreadInfo &inf ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); ARM_COMPUTE_ERROR_ON(_func == nullptr); - (*_func)(_input1, _input2, _output, window); -} - -BorderSize NEArithmeticAdditionKernel::border_size() const -{ - const unsigned int replicateSize = _output->info()->dimension(0) - std::min(_input1->info()->dimension(0), _input2->info()->dimension(0)); - const unsigned int border = std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize); - return BorderSize(0, border, 0, 0); + (*_func)(_input1, _input2, _output, _policy, window); } diff --git a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp index 024c4f8863..f0ac695b20 100644 --- a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp +++ b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2018 ARM Limited. + * Copyright (c) 2017-2019 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -66,37 +66,20 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, con return Status{}; } -std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *bias, ITensorInfo *output) +std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output) { - // Note: This kernel performs 16 elements per iteration. - // However, since we use a left-over for loop, we cannot have any read or write out of memory - // For this reason num_elems_processed_per_iteration is set to 1 - constexpr unsigned int num_elems_processed_per_iteration = 1; - // Output auto inizialitation if not yet initialized auto_init_if_empty(*output, input->clone()->set_data_type(DataType::QASYMM8)); // Configure kernel window - Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration)); - - AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration); - - bool window_changed = update_window_and_padding(win, - input_access); - - if(output->total_size() != 0) - { - output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape())); - } + Window win = calculate_max_window(*input, Steps()); - if(bias != nullptr) - { - AccessWindowStatic bias_access(bias, 0, 0, bias->dimension(0), bias->dimension(1)); - window_changed = window_changed || update_window_and_padding(win, bias_access); - } + // NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel doesn't need padding so update_window_and_padding() can be skipped + Coordinates coord; + coord.set_num_dimensions(output->num_dimensions()); + output->set_valid_region(ValidRegion(coord, output->tensor_shape())); - Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; - return std::make_pair(err, win); + return std::make_pair(Status{}, win); } } // namespace @@ -269,7 +252,7 @@ void NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::configure(const _max = max; // Configure kernel window - auto win_config = validate_and_configure_window(input->info(), (bias != nullptr) ? 
bias->info() : nullptr, output->info()); + auto win_config = validate_and_configure_window(input->info(), output->info()); ARM_COMPUTE_ERROR_THROW_ON(win_config.first); INEKernel::configure(win_config.second); @@ -282,10 +265,7 @@ Status NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::validate(const { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, bias, output, min, max)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), - (bias != nullptr) ? bias->clone().get() : nullptr, - output->clone().get()) - .first); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get()).first); return Status{}; } diff --git a/src/runtime/NEON/functions/NEArithmeticAddition.cpp b/src/runtime/NEON/functions/NEArithmeticAddition.cpp index 677e9f676f..b1550778c3 100644 --- a/src/runtime/NEON/functions/NEArithmeticAddition.cpp +++ b/src/runtime/NEON/functions/NEArithmeticAddition.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2018 ARM Limited. + * Copyright (c) 2017-2019 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -36,16 +36,6 @@ void NEArithmeticAddition::configure(ITensor *input1, ITensor *input2, ITensor * auto k = arm_compute::support::cpp14::make_unique<NEArithmeticAdditionKernel>(); k->configure(input1, input2, output, policy); _kernel = std::move(k); - - if(output->info()->dimension(0) > 1) - { - ITensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2; - - if(broadcasted_info->info()->dimension(0) == 1) - { - _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE); - } - } } Status NEArithmeticAddition::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy) { diff --git a/tests/validation/NEON/ActivationLayer.cpp b/tests/validation/NEON/ActivationLayer.cpp index cd35504465..589b53fe7b 100644 --- a/tests/validation/NEON/ActivationLayer.cpp +++ b/tests/validation/NEON/ActivationLayer.cpp @@ -145,12 +145,10 @@ DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(conca } // Validate padding - const PaddingSize padding = PaddingCalculator(shape.x(), 16).required_padding(); - validate(src.info()->padding(), padding); - + validate(src.info()->padding(), PaddingSize()); if(!in_place) { - validate(dst.info()->padding(), padding); + validate(dst.info()->padding(), PaddingSize()); } } diff --git a/tests/validation/NEON/ArithmeticAddition.cpp b/tests/validation/NEON/ArithmeticAddition.cpp index e66a442d9b..3c734f68ec 100644 --- a/tests/validation/NEON/ArithmeticAddition.cpp +++ b/tests/validation/NEON/ArithmeticAddition.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2018 ARM Limited. + * Copyright (c) 2017-2019 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -18,7 +18,7 @@ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONCLCTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
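// With the REPLICATE border handler removed from NEArithmeticAddition, the
// updated tests below assert that no tensor carries padding at all. A
// hypothetical helper making that expectation explicit, assuming the
// top/right/bottom/left members of arm_compute::PaddingSize (aka BorderSize):
#include "arm_compute/core/Types.h"

bool has_no_padding(const arm_compute::PaddingSize &p)
{
    return (p.top == 0) && (p.right == 0) && (p.bottom == 0) && (p.left == 0);
}
// Equivalent to comparing against a default-constructed PaddingSize(), as the
// tests do via validate(src.info()->padding(), PaddingSize()).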
*/ #include "arm_compute/core/Types.h" @@ -71,26 +71,30 @@ using NEArithmeticAdditionFixture = ArithmeticAdditionValidationFixture<Tensor, DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip( framework::dataset::make("Input1Info", { TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8), TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8), - TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::U8), // Window shrink - TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8), // Invalid data type combination - TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32), // Mismatching shapes + TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::U8), // Unsupported broadcast + TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8), // Invalid data type combination + TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),// Mismatching shapes }), framework::dataset::make("Input2Info",{ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8), TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8), - TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::U8), + TensorInfo(TensorShape(1U, 13U, 2U), 1, DataType::S16), TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S16), TensorInfo(TensorShape(48U, 11U, 2U), 1, DataType::F32), })), framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S16), TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8), - TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::U8), + TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::S16), TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8), TensorInfo(TensorShape(48U, 11U, 2U), 1, DataType::F32), })), framework::dataset::make("Expected", { true, true, false, false, false})), input1_info, input2_info, output_info, expected) { - ARM_COMPUTE_EXPECT(bool(NEArithmeticAddition::validate(&input1_info.clone()->set_is_resizable(false), &input2_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), ConvertPolicy::WRAP)) == expected, framework::LogLevel::ERRORS); + Status s = NEArithmeticAddition::validate(&input1_info.clone()->set_is_resizable(false), + &input2_info.clone()->set_is_resizable(false), + &output_info.clone()->set_is_resizable(false), + ConvertPolicy::WRAP); + ARM_COMPUTE_EXPECT(bool(s) == expected, framework::LogLevel::ERRORS); } // clang-format on // *INDENT-ON* @@ -114,10 +118,9 @@ DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(framework::da validate(dst.info()->valid_region(), valid_region); // Validate padding - const PaddingSize padding = PaddingCalculator(shape.x(), 16).required_padding(); - validate(ref_src1.info()->padding(), padding); - validate(ref_src2.info()->padding(), padding); - validate(dst.info()->padding(), padding); + validate(ref_src1.info()->padding(), PaddingSize()); + validate(ref_src2.info()->padding(), PaddingSize()); + validate(dst.info()->padding(), PaddingSize()); } FIXTURE_DATA_TEST_CASE(RunSmall, NEArithmeticAdditionFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallShapes(), ArithmeticAdditionU8Dataset), @@ -147,10 +150,9 @@ DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(frame validate(dst.info()->valid_region(), valid_region); // Validate padding - const PaddingSize padding = PaddingCalculator(shape.x(), 16).required_padding(); - validate(ref_src1.info()->padding(), padding); - validate(ref_src2.info()->padding(), padding); - validate(dst.info()->padding(), padding); + validate(ref_src1.info()->padding(), PaddingSize()); + 
validate(ref_src2.info()->padding(), PaddingSize()); + validate(dst.info()->padding(), PaddingSize()); } FIXTURE_DATA_TEST_CASE(RunSmall, NEArithmeticAdditionFixture<int16_t>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallShapes(), ArithmeticAdditionS16Dataset), @@ -199,10 +201,9 @@ DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(framework::da validate(dst.info()->valid_region(), valid_region); // Validate padding - const PaddingSize padding = PaddingCalculator(shape.x(), 16).required_padding(); - validate(ref_src1.info()->padding(), padding); - validate(ref_src2.info()->padding(), padding); - validate(dst.info()->padding(), padding); + validate(ref_src1.info()->padding(), PaddingSize()); + validate(ref_src2.info()->padding(), PaddingSize()); + validate(dst.info()->padding(), PaddingSize()); } FIXTURE_DATA_TEST_CASE(RunSmall, NEArithmeticAdditionFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallShapes(), ArithmeticAdditionFP32Dataset), @@ -262,10 +263,9 @@ DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(framework::da validate(dst.info()->valid_region(), valid_region); // Validate padding - const PaddingSize padding = PaddingCalculator(shape.x(), 16).required_padding(); - validate(ref_src1.info()->padding(), padding); - validate(ref_src2.info()->padding(), padding); - validate(dst.info()->padding(), padding); + validate(ref_src1.info()->padding(), PaddingSize()); + validate(ref_src2.info()->padding(), PaddingSize()); + validate(dst.info()->padding(), PaddingSize()); } FIXTURE_DATA_TEST_CASE(RunSmall, |