diff options
Diffstat (limited to 'src/core/NEON/wrapper/intrinsics')
27 files changed, 352 insertions, 81 deletions
diff --git a/src/core/NEON/wrapper/intrinsics/cvt.h b/src/core/NEON/wrapper/intrinsics/cvt.h index 6e79a92bc2..381de2284a 100644 --- a/src/core/NEON/wrapper/intrinsics/cvt.h +++ b/src/core/NEON/wrapper/intrinsics/cvt.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020 Arm Limited. + * Copyright (c) 2020, 2022-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -30,12 +30,11 @@ namespace arm_compute { namespace wrapper { -#define VCVT_TO_F32_IMPL(ptype, vtype, prefix, postfix1, postfix2) \ - template <typename T> \ - inline typename std::enable_if<std::is_same<T, float>::value, float32x4_t>::type \ - vcvt(const vtype &a) \ - { \ - return prefix##_##postfix1##_##postfix2(a); \ +#define VCVT_TO_F32_IMPL(ptype, vtype, prefix, postfix1, postfix2) \ + template <typename T> \ + inline typename std::enable_if<std::is_same<T, float>::value, float32x4_t>::type vcvt(const vtype &a) \ + { \ + return prefix##_##postfix1##_##postfix2(a); \ } VCVT_TO_F32_IMPL(float32x4_t, uint32x4_t, vcvtq, f32, u32) @@ -46,12 +45,11 @@ VCVT_TO_F32_IMPL(float32x4_t, float16x4_t, vcvt, f32, f16) #undef VCVT_TO_F32_IMPL #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -#define VCVT_TO_F16_IMPL(ptype, vtype, prefix, postfix1, postfix2) \ - template <typename T> \ - inline typename std::enable_if<std::is_same<T, float16_t>::value, float16x4_t>::type \ - vcvt(const vtype &a) \ - { \ - return prefix##_##postfix1##_##postfix2(a); \ +#define VCVT_TO_F16_IMPL(ptype, vtype, prefix, postfix1, postfix2) \ + template <typename T> \ + inline typename std::enable_if<std::is_same<T, float16_t>::value, float16x4_t>::type vcvt(const vtype &a) \ + { \ + return prefix##_##postfix1##_##postfix2(a); \ } VCVT_TO_F16_IMPL(float16x4_t, float32x4_t, vcvt, f16, f32) @@ -59,20 +57,34 @@ VCVT_TO_F16_IMPL(float16x4_t, float32x4_t, vcvt, f16, f32) #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC template <typename T> -inline typename std::enable_if<std::is_same<T, uint8_t>::value, uint32x4_t>::type +inline typename std::enable_if<std::is_same<T, uint8_t>::value || std::is_same<T, uint32_t>::value, uint32x4_t>::type vcvt(const float32x4_t &a) { return vcvtq_u32_f32(a); } template <typename T> -inline typename std::enable_if<std::is_same<T, int8_t>::value, int32x4_t>::type +inline typename std::enable_if<std::is_same<T, int8_t>::value || std::is_same<T, int32_t>::value, int32x4_t>::type vcvt(const float32x4_t &a) { return vcvtq_s32_f32(a); } -#if defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16) +#ifdef __aarch64__ +template <typename T> +inline typename std::enable_if<std::is_same<T, uint32_t>::value, uint32x4_t>::type vcvta(const float32x4_t &a) +{ + return vcvtaq_u32_f32(a); +} + +template <typename T> +inline typename std::enable_if<std::is_same<T, int32_t>::value, int32x4_t>::type vcvta(const float32x4_t &a) +{ + return vcvtaq_s32_f32(a); +} +#endif //__aarch64__ + +#if defined(ARM_COMPUTE_ENABLE_BF16) /** Convert 2x128-bit floating point vectors into 1x128-bit bfloat16 vector * * @param[in] inptr Pointer to the input memory to load values from @@ -80,16 +92,15 @@ vcvt(const float32x4_t &a) */ inline void vcvt_bf16_f32(const float *inptr, uint16_t *outptr) { - __asm __volatile( - "ldp q0, q1, [%[inptr]]\n" - ".inst 0xea16800\n" // BFCVTN v0, v0 - ".inst 0x4ea16820\n" // BFCVTN2 v0, v1 - "str q0, [%[outptr]]\n" - : [inptr] "+r"(inptr) - : [outptr] "r"(outptr) - : "v0", "v1", "memory"); + __asm __volatile("ldp q0, q1, [%[inptr]]\n" + ".inst 0xea16800\n" // BFCVTN v0, v0 + ".inst 0x4ea16820\n" // BFCVTN2 v0, v1 + "str q0, [%[outptr]]\n" + : [inptr] "+r"(inptr) + : [outptr] "r"(outptr) + : "v0", "v1", "memory"); } -#endif /* defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16) */ +#endif /* defined(ARM_COMPUTE_ENABLE_BF16) */ } // namespace wrapper } // namespace arm_compute diff --git a/src/core/NEON/wrapper/intrinsics/div.h b/src/core/NEON/wrapper/intrinsics/div.h index 265f30d33b..ece991a5b0 100644 --- a/src/core/NEON/wrapper/intrinsics/div.h +++ b/src/core/NEON/wrapper/intrinsics/div.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_WRAPPER_DIV_H #include "src/core/NEON/NEMath.h" + #include <arm_neon.h> namespace arm_compute diff --git a/src/core/NEON/wrapper/intrinsics/erf.h b/src/core/NEON/wrapper/intrinsics/erf.h new file mode 100644 index 0000000000..0e34462b96 --- /dev/null +++ b/src/core/NEON/wrapper/intrinsics/erf.h @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef ARM_COMPUTE_WRAPPER_ERF_H +#define ARM_COMPUTE_WRAPPER_ERF_H + +#include "src/core/NEON/NEMath.h" + +#include <arm_neon.h> + +namespace arm_compute +{ +namespace wrapper +{ +#define VERF_IMPL(vtype, prefix, postfix) \ + inline vtype verf(const vtype &a) \ + { \ + return prefix##_##postfix(a); \ + } + +VERF_IMPL(float32x4_t, verfq, f32) +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +VERF_IMPL(float16x8_t, verfq, f16) +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + +#undef VERF_IMPL + +} // namespace wrapper +} // namespace arm_compute + +#endif /* ARM_COMPUTE_WRAPPER_ERF_H */ diff --git a/src/core/NEON/wrapper/intrinsics/exp.h b/src/core/NEON/wrapper/intrinsics/exp.h index c2a6970967..f44577b926 100644 --- a/src/core/NEON/wrapper/intrinsics/exp.h +++ b/src/core/NEON/wrapper/intrinsics/exp.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_WRAPPER_EXP_H #include "src/core/NEON/NEMath.h" + #include <arm_neon.h> namespace arm_compute diff --git a/src/core/NEON/wrapper/intrinsics/getlane.h b/src/core/NEON/wrapper/intrinsics/getlane.h index 2052751612..ae813bb2fa 100644 --- a/src/core/NEON/wrapper/intrinsics/getlane.h +++ b/src/core/NEON/wrapper/intrinsics/getlane.h @@ -33,7 +33,7 @@ namespace wrapper #define VGETLANE_IMPL_8(stype, vtype, postfix) \ inline stype vgetlane(const vtype vector, const unsigned int lane) \ { \ - switch(lane) \ + switch (lane) \ { \ case 0: \ return vget_lane_##postfix(vector, 0); \ @@ -59,7 +59,7 @@ namespace wrapper #define VGETLANE_IMPL_4(stype, vtype, postfix) \ inline stype vgetlane(const vtype vector, const unsigned int lane) \ { \ - switch(lane) \ + switch (lane) \ { \ case 0: \ return vget_lane_##postfix(vector, 0); \ @@ -77,7 +77,7 @@ namespace wrapper #define VGETLANE_IMPL_2(stype, vtype, postfix) \ inline stype vgetlane(const vtype vector, const unsigned int lane) \ { \ - switch(lane) \ + switch (lane) \ { \ case 0: \ return vget_lane_##postfix(vector, 0); \ @@ -102,7 +102,7 @@ VGETLANE_IMPL_4(float16_t, float16x4_t, f16) #define VGETQLANE_IMPL_16(stype, vtype, postfix) \ inline stype vgetlane(const vtype vector, const unsigned int lane) \ { \ - switch(lane) \ + switch (lane) \ { \ case 0: \ return vgetq_lane_##postfix(vector, 0); \ @@ -144,7 +144,7 @@ VGETLANE_IMPL_4(float16_t, float16x4_t, f16) #define VGETQLANE_IMPL_8(stype, vtype, postfix) \ inline stype vgetlane(const vtype vector, const unsigned int lane) \ { \ - switch(lane) \ + switch (lane) \ { \ case 0: \ return vgetq_lane_##postfix(vector, 0); \ @@ -170,7 +170,7 @@ VGETLANE_IMPL_4(float16_t, float16x4_t, f16) #define VGETQLANE_IMPL_4(stype, vtype, postfix) \ inline stype vgetlane(const vtype vector, const unsigned int lane) \ { \ - switch(lane) \ + switch (lane) \ { \ case 0: \ return vgetq_lane_##postfix(vector, 0); \ @@ -188,7 +188,7 @@ VGETLANE_IMPL_4(float16_t, float16x4_t, f16) #define VGETQLANE_IMPL_2(stype, vtype, postfix) \ inline stype vgetlane(const vtype vector, const unsigned int lane) \ { \ - switch(lane) \ + switch (lane) \ { \ case 0: \ return vgetq_lane_##postfix(vector, 0); \ diff --git a/src/core/NEON/wrapper/intrinsics/intrinsics.h b/src/core/NEON/wrapper/intrinsics/intrinsics.h index 871d9cc5ac..97975ebe7c 100644 --- a/src/core/NEON/wrapper/intrinsics/intrinsics.h +++ b/src/core/NEON/wrapper/intrinsics/intrinsics.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2021 Arm Limited. + * Copyright (c) 2018-2022 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -39,6 +39,7 @@ #include "src/core/NEON/wrapper/intrinsics/div.h" #include "src/core/NEON/wrapper/intrinsics/dup_n.h" #include "src/core/NEON/wrapper/intrinsics/eor.h" +#include "src/core/NEON/wrapper/intrinsics/erf.h" #include "src/core/NEON/wrapper/intrinsics/exp.h" #include "src/core/NEON/wrapper/intrinsics/ext.h" #include "src/core/NEON/wrapper/intrinsics/gethigh.h" @@ -66,6 +67,7 @@ #include "src/core/NEON/wrapper/intrinsics/rev64.h" #include "src/core/NEON/wrapper/intrinsics/round.h" #include "src/core/NEON/wrapper/intrinsics/setlane.h" +#include "src/core/NEON/wrapper/intrinsics/shr.h" #include "src/core/NEON/wrapper/intrinsics/sin.h" #include "src/core/NEON/wrapper/intrinsics/sqrt.h" #include "src/core/NEON/wrapper/intrinsics/store.h" diff --git a/src/core/NEON/wrapper/intrinsics/inv.h b/src/core/NEON/wrapper/intrinsics/inv.h index de398b0403..e443be679b 100644 --- a/src/core/NEON/wrapper/intrinsics/inv.h +++ b/src/core/NEON/wrapper/intrinsics/inv.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_WRAPPER_INV_H #include "src/core/NEON/NEMath.h" + #include <arm_neon.h> namespace arm_compute diff --git a/src/core/NEON/wrapper/intrinsics/invsqrt.h b/src/core/NEON/wrapper/intrinsics/invsqrt.h index 2343efa8f8..257b445cc7 100644 --- a/src/core/NEON/wrapper/intrinsics/invsqrt.h +++ b/src/core/NEON/wrapper/intrinsics/invsqrt.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_WRAPPER_INVSQRT_H #include "src/core/NEON/NEMath.h" + #include <arm_neon.h> namespace arm_compute diff --git a/src/core/NEON/wrapper/intrinsics/log.h b/src/core/NEON/wrapper/intrinsics/log.h index 357a77ca78..d091407edb 100644 --- a/src/core/NEON/wrapper/intrinsics/log.h +++ b/src/core/NEON/wrapper/intrinsics/log.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_WRAPPER_LOG_H #include "src/core/NEON/NEMath.h" + #include <arm_neon.h> namespace arm_compute diff --git a/src/core/NEON/wrapper/intrinsics/max.h b/src/core/NEON/wrapper/intrinsics/max.h index cec437d171..32d38a856c 100644 --- a/src/core/NEON/wrapper/intrinsics/max.h +++ b/src/core/NEON/wrapper/intrinsics/max.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 Arm Limited. + * Copyright (c) 2018-2020, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_WRAPPER_MAX_H -#define ARM_COMPUTE_WRAPPER_MAX_H +#ifndef ACL_SRC_CORE_NEON_WRAPPER_INTRINSICS_MAX_H +#define ACL_SRC_CORE_NEON_WRAPPER_INTRINSICS_MAX_H #include <arm_neon.h> @@ -59,6 +59,39 @@ VMAX_IMPL(float16_t, float16x8_t, vmaxq, f16) #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC #undef VMAX_IMPL + +#if defined(__aarch64__) +// VMAXV: Across vector max +#define VMAXV_IMPL(stype, vtype, prefix, postfix) \ + inline stype vmaxv(const vtype &a) \ + { \ + return prefix##_##postfix(a); \ + } + +VMAXV_IMPL(uint8_t, uint8x8_t, vmaxv, u8) +VMAXV_IMPL(int8_t, int8x8_t, vmaxv, s8) +VMAXV_IMPL(uint16_t, uint16x4_t, vmaxv, u16) +VMAXV_IMPL(int16_t, int16x4_t, vmaxv, s16) +VMAXV_IMPL(uint32_t, uint32x2_t, vmaxv, u32) +VMAXV_IMPL(int32_t, int32x2_t, vmaxv, s32) +VMAXV_IMPL(float, float32x2_t, vmaxv, f32) +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +VMAXV_IMPL(float16_t, float16x4_t, vmaxv, f16) +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + +VMAXV_IMPL(uint8_t, uint8x16_t, vmaxvq, u8) +VMAXV_IMPL(int8_t, int8x16_t, vmaxvq, s8) +VMAXV_IMPL(uint16_t, uint16x8_t, vmaxvq, u16) +VMAXV_IMPL(int16_t, int16x8_t, vmaxvq, s16) +VMAXV_IMPL(uint32_t, uint32x4_t, vmaxvq, u32) +VMAXV_IMPL(int32_t, int32x4_t, vmaxvq, s32) +VMAXV_IMPL(float, float32x4_t, vmaxvq, f32) +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +VMAXV_IMPL(float16_t, float16x8_t, vmaxvq, f16) +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + +#undef VMAXV_IMPL +#endif // defined(__aarch64__) } // namespace wrapper } // namespace arm_compute -#endif /* ARM_COMPUTE_WRAPPER_MAX_H */ +#endif // ACL_SRC_CORE_NEON_WRAPPER_INTRINSICS_MAX_H diff --git a/src/core/NEON/wrapper/intrinsics/pow.h b/src/core/NEON/wrapper/intrinsics/pow.h index 61f834ed23..dfd6ccc358 100644 --- a/src/core/NEON/wrapper/intrinsics/pow.h +++ b/src/core/NEON/wrapper/intrinsics/pow.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_WRAPPER_POW_H #include "src/core/NEON/NEMath.h" + #include <arm_neon.h> namespace arm_compute diff --git a/src/core/NEON/wrapper/intrinsics/qmov.h b/src/core/NEON/wrapper/intrinsics/qmov.h index 167f3cf43b..9a0a23a241 100644 --- a/src/core/NEON/wrapper/intrinsics/qmov.h +++ b/src/core/NEON/wrapper/intrinsics/qmov.h @@ -31,15 +31,13 @@ namespace arm_compute namespace wrapper { template <typename T> -inline typename std::enable_if<std::is_same<T, uint8_t>::value, uint8x8_t>::type -vqmov(const int16x8_t &a) +inline typename std::enable_if<std::is_same<T, uint8_t>::value, uint8x8_t>::type vqmov(const int16x8_t &a) { return vqmovun_s16(a); } template <typename T> -inline typename std::enable_if<std::is_same<T, int8_t>::value, int8x8_t>::type -vqmov(const int16x8_t &a) +inline typename std::enable_if<std::is_same<T, int8_t>::value, int8x8_t>::type vqmov(const int16x8_t &a) { return vqmovn_s16(a); } diff --git a/src/core/NEON/wrapper/intrinsics/reinterpret.h b/src/core/NEON/wrapper/intrinsics/reinterpret.h index cf00a4aceb..c2c4f720d2 100644 --- a/src/core/NEON/wrapper/intrinsics/reinterpret.h +++ b/src/core/NEON/wrapper/intrinsics/reinterpret.h @@ -35,7 +35,7 @@ namespace wrapper { \ return prefix##_##postfix1##_##postfix2(a); \ } \ - \ + \ inline ptype vreinterpret(const ptype &a) \ { \ return a; \ diff --git a/src/core/NEON/wrapper/intrinsics/round.h b/src/core/NEON/wrapper/intrinsics/round.h index d23feb6b42..7789aab770 100644 --- a/src/core/NEON/wrapper/intrinsics/round.h +++ b/src/core/NEON/wrapper/intrinsics/round.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_WRAPPER_ROUND_H #include "src/core/NEON/NEMath.h" + #include <arm_neon.h> namespace arm_compute diff --git a/src/core/NEON/wrapper/intrinsics/setlane.h b/src/core/NEON/wrapper/intrinsics/setlane.h index 197eedacb5..259b8eaf90 100644 --- a/src/core/NEON/wrapper/intrinsics/setlane.h +++ b/src/core/NEON/wrapper/intrinsics/setlane.h @@ -33,7 +33,7 @@ namespace wrapper #define VSETLANE_IMPL_8(stype, atype, vtype, postfix) \ inline stype vsetlane(const atype value, const vtype vector, const unsigned int lane) \ { \ - switch(lane) \ + switch (lane) \ { \ case 0: \ return vset_lane_##postfix(value, vector, 0); \ @@ -59,7 +59,7 @@ namespace wrapper #define VSETLANE_IMPL_4(stype, atype, vtype, postfix) \ inline stype vsetlane(const atype value, const vtype vector, const unsigned int lane) \ { \ - switch(lane) \ + switch (lane) \ { \ case 0: \ return vset_lane_##postfix(value, vector, 0); \ @@ -77,7 +77,7 @@ namespace wrapper #define VSETLANE_IMPL_2(stype, atype, vtype, postfix) \ inline stype vsetlane(const atype value, const vtype vector, const unsigned int lane) \ { \ - switch(lane) \ + switch (lane) \ { \ case 0: \ return vset_lane_##postfix(value, vector, 0); \ @@ -102,7 +102,7 @@ VSETLANE_IMPL_4(float16x4_t, float16_t, float16x4_t, f16) #define VSETQLANE_IMPL_16(stype, atype, vtype, postfix) \ inline stype vsetlane(const atype value, const vtype vector, const unsigned int lane) \ { \ - switch(lane) \ + switch (lane) \ { \ case 0: \ return vsetq_lane_##postfix(value, vector, 0); \ @@ -144,7 +144,7 @@ VSETLANE_IMPL_4(float16x4_t, float16_t, float16x4_t, f16) #define VSETQLANE_IMPL_8(stype, atype, vtype, postfix) \ inline stype vsetlane(const atype value, const vtype vector, const unsigned int lane) \ { \ - switch(lane) \ + switch (lane) \ { \ case 0: \ return vsetq_lane_##postfix(value, vector, 0); \ @@ -170,7 +170,7 @@ VSETLANE_IMPL_4(float16x4_t, float16_t, float16x4_t, f16) #define VSETQLANE_IMPL_4(stype, atype, vtype, postfix) \ inline stype vsetlane(const atype value, const vtype vector, const unsigned int lane) \ { \ - switch(lane) \ + switch (lane) \ { \ case 0: \ return vsetq_lane_##postfix(value, vector, 0); \ diff --git a/src/core/NEON/wrapper/intrinsics/shr.h b/src/core/NEON/wrapper/intrinsics/shr.h new file mode 100644 index 0000000000..6ccb9cdf92 --- /dev/null +++ b/src/core/NEON/wrapper/intrinsics/shr.h @@ -0,0 +1,148 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef ARM_COMPUTE_WRAPPER_SHR_H +#define ARM_COMPUTE_WRAPPER_SHR_H + +#include <arm_neon.h> +#include <type_traits> + +namespace arm_compute +{ +namespace wrapper +{ +#define VQRSHRN_IMPL(half_vtype, vtype, prefix, postfix) \ + template <int b> \ + inline half_vtype vqrshrn(const vtype &a) \ + { \ + return prefix##_##postfix(a, b); \ + } +VQRSHRN_IMPL(int8x8_t, int16x8_t, vqrshrn_n, s16) +VQRSHRN_IMPL(uint8x8_t, uint16x8_t, vqrshrn_n, u16) +VQRSHRN_IMPL(int16x4_t, int32x4_t, vqrshrn_n, s32) +VQRSHRN_IMPL(uint16x4_t, uint32x4_t, vqrshrn_n, u32) +VQRSHRN_IMPL(int32x2_t, int64x2_t, vqrshrn_n, s64) +VQRSHRN_IMPL(uint32x2_t, uint64x2_t, vqrshrn_n, u64) + +#undef VQRSHRN_IMPL + +#ifdef __aarch64__ +#define VQRSHRN_SCALAR_IMPL(half_vtype, vtype, prefix, postfix) \ + template <int b> \ + inline half_vtype vqrshrn(const vtype &a) \ + { \ + return prefix##_##postfix(a, b); \ + } + +VQRSHRN_SCALAR_IMPL(int8_t, int16_t, vqrshrnh_n, s16) +VQRSHRN_SCALAR_IMPL(uint8_t, uint16_t, vqrshrnh_n, u16) +VQRSHRN_SCALAR_IMPL(int16_t, int32_t, vqrshrns_n, s32) +VQRSHRN_SCALAR_IMPL(uint16_t, uint32_t, vqrshrns_n, u32) +VQRSHRN_SCALAR_IMPL(int32_t, int64_t, vqrshrnd_n, s64) +VQRSHRN_SCALAR_IMPL(uint32_t, uint64_t, vqrshrnd_n, u64) + +#undef VQRSHRN_SCALAR_IMPL +#endif // __aarch64__ + +// This function is the mixed version of VQRSHRN and VQRSHRUN. +// The input vector is always signed integer, while the returned vector +// can be either signed or unsigned depending on the signedness of scalar type T. +#define VQRSHRN_EX_IMPL(half_vtype, vtype, prefix_signed, prefix_unsigned, postfix) \ + template <int b, typename T> \ + inline typename std::enable_if<std::is_integral<T>::value && std::is_signed<T>::value, half_vtype>::type \ + vqrshrn_ex(const vtype &a) \ + { \ + return prefix_signed##_##postfix(a, b); \ + } \ + \ + template <int b, typename T> \ + inline typename std::enable_if<std::is_integral<T>::value && !std::is_signed<T>::value, u##half_vtype>::type \ + vqrshrn_ex(const vtype &a) \ + { \ + return prefix_unsigned##_##postfix(a, b); \ + } +VQRSHRN_EX_IMPL(int8x8_t, int16x8_t, vqrshrn_n, vqrshrun_n, s16) +VQRSHRN_EX_IMPL(int16x4_t, int32x4_t, vqrshrn_n, vqrshrun_n, s32) +VQRSHRN_EX_IMPL(int32x2_t, int64x2_t, vqrshrn_n, vqrshrun_n, s64) +#undef VQRSHRN_EX_IMPL + +#define VSHR_IMPL(vtype, prefix, postfix) \ + template <int b> \ + inline vtype vshr_n(const vtype &a) \ + { \ + return prefix##_##postfix(a, b); \ + } +VSHR_IMPL(uint8x8_t, vshr_n, u8) +VSHR_IMPL(int8x8_t, vshr_n, s8) +#undef VSHR_IMPL + +#define VSHRQ_IMPL(vtype, prefix, postfix) \ + template <int b> \ + inline vtype vshrq_n(const vtype &a) \ + { \ + return prefix##_##postfix(a, b); \ + } +VSHRQ_IMPL(uint32x4_t, vshrq_n, u32) +VSHRQ_IMPL(int32x4_t, vshrq_n, s32) +#undef VSHRQ_IMPL + +#ifdef __aarch64__ +#define VSHRQ_SCALAR_IMPL(vtype, prefix, postfix) \ + template <int b> \ + inline vtype vshrq_n(const vtype &a) \ + { \ + return prefix##_##postfix(a, b); \ + } +VSHRQ_SCALAR_IMPL(uint32_t, vshrd_n, u64) +VSHRQ_SCALAR_IMPL(int32_t, vshrd_n, s64) + +#undef VSHRQ_SCALAR_IMPL +#endif // __aarch64__ + +#ifdef __aarch64__ +#define VQRSHRN_EX_SCALAR_IMPL(half_vtype, vtype, prefix_signed, prefix_unsigned, postfix) \ + template <int b, typename T> \ + inline typename std::enable_if<std::is_integral<T>::value && std::is_signed<T>::value, half_vtype>::type \ + vqrshrn_ex(const vtype &a) \ + { \ + return prefix_signed##_##postfix(a, b); \ + } \ + \ + template <int b, typename T> \ + inline typename std::enable_if<std::is_integral<T>::value && !std::is_signed<T>::value, u##half_vtype>::type \ + vqrshrn_ex(const vtype &a) \ + { \ + return prefix_unsigned##_##postfix(a, b); \ + } + +VQRSHRN_EX_SCALAR_IMPL(int8_t, int16_t, vqrshrnh_n, vqrshrunh_n, s16) +VQRSHRN_EX_SCALAR_IMPL(int16_t, int32_t, vqrshrns_n, vqrshruns_n, s32) +VQRSHRN_EX_SCALAR_IMPL(int32_t, int64_t, vqrshrnd_n, vqrshrund_n, s64) + +#undef VQRSHRN_EX_IMPL +#endif // __aarch64__ + +} // namespace wrapper +} // namespace arm_compute +#endif /* ARM_COMPUTE_WRAPPER_SHR_H */ diff --git a/src/core/NEON/wrapper/intrinsics/sin.h b/src/core/NEON/wrapper/intrinsics/sin.h index 03c2813a32..d24fdfa816 100644 --- a/src/core/NEON/wrapper/intrinsics/sin.h +++ b/src/core/NEON/wrapper/intrinsics/sin.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_WRAPPER_SIN_H #include "src/core/NEON/NEMath.h" + #include <arm_neon.h> namespace arm_compute @@ -54,4 +55,4 @@ VSIN_IMPL_INT(int32x4_t, vsinq, s32) #undef vsub_IMPL } // namespace wrapper } // namespace arm_compute -#endif /* ARM_COMPUTE_WRAPPER_SUB_H */
\ No newline at end of file +#endif /* ARM_COMPUTE_WRAPPER_SUB_H */ diff --git a/src/core/NEON/wrapper/intrinsics/store.h b/src/core/NEON/wrapper/intrinsics/store.h index 6dda432ea9..ce1b9a554e 100644 --- a/src/core/NEON/wrapper/intrinsics/store.h +++ b/src/core/NEON/wrapper/intrinsics/store.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 Arm Limited. + * Copyright (c) 2018-2020, 2022 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -44,8 +44,6 @@ VSTORE_IMPL(uint16_t, uint16x4_t, vst1, u16) VSTORE_IMPL(int16_t, int16x4_t, vst1, s16) VSTORE_IMPL(uint32_t, uint32x2_t, vst1, u32) VSTORE_IMPL(int32_t, int32x2_t, vst1, s32) -//VSTORE_IMPL(uint64_t, 1, vst1, u64) -//VSTORE_IMPL(int64_t, 1, vst1, s64) VSTORE_IMPL(float, float32x2_t, vst1, f32) #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC VSTORE_IMPL(float16_t, float16x4_t, vst1, f16) @@ -57,8 +55,6 @@ VSTORE_IMPL(uint16_t, uint16x8_t, vst1q, u16) VSTORE_IMPL(int16_t, int16x8_t, vst1q, s16) VSTORE_IMPL(uint32_t, uint32x4_t, vst1q, u32) VSTORE_IMPL(int32_t, int32x4_t, vst1q, s32) -//VSTORE_IMPL(uint64_t, 2, vst1q, u64) -//VSTORE_IMPL(int64_t, 2, vst1q, s64) VSTORE_IMPL(float, float32x4_t, vst1q, f32) #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC VSTORE_IMPL(float16_t, float16x8_t, vst1q, f16) diff --git a/src/core/NEON/wrapper/intrinsics/sub.h b/src/core/NEON/wrapper/intrinsics/sub.h index 475986d0f6..20436714ef 100644 --- a/src/core/NEON/wrapper/intrinsics/sub.h +++ b/src/core/NEON/wrapper/intrinsics/sub.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 Arm Limited. + * Copyright (c) 2018-2020, 2022 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -98,6 +98,21 @@ VQSUB_IMPL(float16x8_t, float16x8_t, vsubq, f16) #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC #undef VQSUB_IMPL +#define VSUBL_IMPL(rtype, vtype, prefix, postfix) \ + inline rtype vsubl(const vtype &a, const vtype &b) \ + { \ + return prefix##_##postfix(a, b); \ + } + +VSUBL_IMPL(int16x8_t, int8x8_t, vsubl, s8) +VSUBL_IMPL(int32x4_t, int16x4_t, vsubl, s16) +VSUBL_IMPL(int64x2_t, int32x2_t, vsubl, s32) +VSUBL_IMPL(uint16x8_t, uint8x8_t, vsubl, u8) +VSUBL_IMPL(uint32x4_t, uint16x4_t, vsubl, u16) +VSUBL_IMPL(uint64x2_t, uint32x2_t, vsubl, u32) + +#undef VSUB_IMPL + } // namespace wrapper } // namespace arm_compute #endif /* ARM_COMPUTE_WRAPPER_SUB_H */ diff --git a/src/core/NEON/wrapper/intrinsics/svcnt.h b/src/core/NEON/wrapper/intrinsics/svcnt.h index e530e7c83f..c4652504b4 100644 --- a/src/core/NEON/wrapper/intrinsics/svcnt.h +++ b/src/core/NEON/wrapper/intrinsics/svcnt.h @@ -30,7 +30,7 @@ namespace arm_compute namespace wrapper { template <size_t element_size> -inline uint64_t svcnt_size(); +inline uint64_t svcnt_size(); template <> inline uint64_t svcnt_size<64>() @@ -65,4 +65,4 @@ inline uint64_t svcnt() } // namespace arm_compute #endif /* defined(__ARM_FEATURE_SVE) */ -#endif /* SRC_CORE_NEON_WRAPPER_INTRINSICS_SVCNT_H */
\ No newline at end of file +#endif /* SRC_CORE_NEON_WRAPPER_INTRINSICS_SVCNT_H */ diff --git a/src/core/NEON/wrapper/intrinsics/svcvt.h b/src/core/NEON/wrapper/intrinsics/svcvt.h index 746b004d7d..00ef7b7eb3 100644 --- a/src/core/NEON/wrapper/intrinsics/svcvt.h +++ b/src/core/NEON/wrapper/intrinsics/svcvt.h @@ -29,11 +29,12 @@ namespace arm_compute { namespace wrapper { -#define SVCVT_Z_TO_F32_IMPL(vtype) \ - template <typename T> \ - inline typename std::enable_if<std::is_same<T, float>::value, svfloat32_t>::type svcvt_z(svbool_t pg, const vtype &a) \ - { \ - return svcvt_f32_z(pg, a); \ +#define SVCVT_Z_TO_F32_IMPL(vtype) \ + template <typename T> \ + inline typename std::enable_if<std::is_same<T, float>::value, svfloat32_t>::type svcvt_z(svbool_t pg, \ + const vtype &a) \ + { \ + return svcvt_f32_z(pg, a); \ } SVCVT_Z_TO_F32_IMPL(svuint32_t) @@ -42,11 +43,12 @@ SVCVT_Z_TO_F32_IMPL(svfloat16_t) #undef SVCVT_Z_TO_F32_IMPL -#define SVCVT_Z_TO_F16_IMPL(vtype) \ - template <typename T> \ - inline typename std::enable_if<std::is_same<T, float16_t>::value, svfloat16_t>::type svcvt_z(svbool_t pg, const vtype &a) \ - { \ - return svcvt_f16_z(pg, a); \ +#define SVCVT_Z_TO_F16_IMPL(vtype) \ + template <typename T> \ + inline typename std::enable_if<std::is_same<T, float16_t>::value, svfloat16_t>::type svcvt_z(svbool_t pg, \ + const vtype &a) \ + { \ + return svcvt_f16_z(pg, a); \ } SVCVT_Z_TO_F16_IMPL(svuint32_t) @@ -55,11 +57,12 @@ SVCVT_Z_TO_F16_IMPL(svfloat32_t) #undef SVCVT_Z_TO_F16_IMPL -#define SVCVT_Z_TO_S32_IMPL(vtype) \ - template <typename T> \ - inline typename std::enable_if<std::is_same<T, int32_t>::value, svint32_t>::type svcvt_z(svbool_t pg, const vtype &a) \ - { \ - return svcvt_s32_z(pg, a); \ +#define SVCVT_Z_TO_S32_IMPL(vtype) \ + template <typename T> \ + inline typename std::enable_if<std::is_same<T, int32_t>::value, svint32_t>::type svcvt_z(svbool_t pg, \ + const vtype &a) \ + { \ + return svcvt_s32_z(pg, a); \ } SVCVT_Z_TO_S32_IMPL(svfloat16_t) @@ -71,4 +74,4 @@ SVCVT_Z_TO_S32_IMPL(svfloat32_t) } // namespace arm_compute #endif /* defined(__ARM_FEATURE_SVE) */ -#endif /* SRC_CORE_NEON_WRAPPER_INTRINSICS_SVCVT_H */
\ No newline at end of file +#endif /* SRC_CORE_NEON_WRAPPER_INTRINSICS_SVCVT_H */ diff --git a/src/core/NEON/wrapper/intrinsics/svdup_n.h b/src/core/NEON/wrapper/intrinsics/svdup_n.h index b1aed97d9c..9c42c86db7 100644 --- a/src/core/NEON/wrapper/intrinsics/svdup_n.h +++ b/src/core/NEON/wrapper/intrinsics/svdup_n.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020 Arm Limited. + * Copyright (c) 2020, 2022 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -46,7 +46,9 @@ SVDUP_N_IMPL(uint64_t, svuint64_t, u64) SVDUP_N_IMPL(float16_t, svfloat16_t, f16) SVDUP_N_IMPL(float, svfloat32_t, f32) SVDUP_N_IMPL(float64_t, svfloat64_t, f64) +#if __ARM_FEATURE_SVE_BF16 SVDUP_N_IMPL(bfloat16_t, svbfloat16_t, bf16) +#endif // #if __ARM_FEATURE_SVE_BF16 #undef SVDUP_N_IMPL @@ -54,4 +56,4 @@ SVDUP_N_IMPL(bfloat16_t, svbfloat16_t, bf16) } // namespace arm_compute #endif /* defined(__ARM_FEATURE_SVE) */ -#endif /* SRC_CORE_NEON_WRAPPER_INTRINSICS_SVDUP_N_H */
\ No newline at end of file +#endif /* SRC_CORE_NEON_WRAPPER_INTRINSICS_SVDUP_N_H */ diff --git a/src/core/NEON/wrapper/intrinsics/svexp.h b/src/core/NEON/wrapper/intrinsics/svexp.h index d6ce9a77d1..1e8bce3960 100644 --- a/src/core/NEON/wrapper/intrinsics/svexp.h +++ b/src/core/NEON/wrapper/intrinsics/svexp.h @@ -26,6 +26,7 @@ #if defined(__ARM_FEATURE_SVE) #include "src/core/NEON/SVEMath.h" + #include <arm_sve.h> namespace arm_compute @@ -46,4 +47,4 @@ SVEXP_IMPL(svfloat16_t, f16) } // namespace arm_compute #endif /* defined(__ARM_FEATURE_SVE) */ -#endif /* SRC_CORE_NEON_WRAPPER_INTRINSICS_SVEXP_H */
\ No newline at end of file +#endif /* SRC_CORE_NEON_WRAPPER_INTRINSICS_SVEXP_H */ diff --git a/src/core/NEON/wrapper/intrinsics/svlog.h b/src/core/NEON/wrapper/intrinsics/svlog.h index 5b505ae1e3..b4630e20ed 100644 --- a/src/core/NEON/wrapper/intrinsics/svlog.h +++ b/src/core/NEON/wrapper/intrinsics/svlog.h @@ -25,6 +25,7 @@ #define SRC_CORE_NEON_WRAPPER_INTRINSICS_SVLOG_H #if defined(__ARM_FEATURE_SVE) #include "src/core/NEON/SVEMath.h" + #include <arm_sve.h> namespace arm_compute @@ -44,4 +45,4 @@ SVLOG_IMPL(svfloat16_t, f16) } // namespace wrapper } // namespace arm_compute #endif /* defined(__ARM_FEATURE_SVE) */ -#endif /* SRC_CORE_NEON_WRAPPER_INTRINSICS_SVLOG_H */
\ No newline at end of file +#endif /* SRC_CORE_NEON_WRAPPER_INTRINSICS_SVLOG_H */ diff --git a/src/core/NEON/wrapper/intrinsics/svptrue.h b/src/core/NEON/wrapper/intrinsics/svptrue.h index 53407e5301..6ed00bccbf 100644 --- a/src/core/NEON/wrapper/intrinsics/svptrue.h +++ b/src/core/NEON/wrapper/intrinsics/svptrue.h @@ -30,7 +30,7 @@ namespace arm_compute namespace wrapper { template <size_t element_size> -inline svbool_t svptrue_size(); +inline svbool_t svptrue_size(); template <> inline svbool_t svptrue_size<64>() @@ -65,4 +65,4 @@ svbool_t svptrue() } // namespace arm_compute #endif /* defined(__ARM_FEATURE_SVE) */ -#endif /* SRC_CORE_NEON_WRAPPER_INTRINSICS_SVPTRUE_H */
\ No newline at end of file +#endif /* SRC_CORE_NEON_WRAPPER_INTRINSICS_SVPTRUE_H */ diff --git a/src/core/NEON/wrapper/intrinsics/svwhilelt.h b/src/core/NEON/wrapper/intrinsics/svwhilelt.h index ef58217dc4..f0f84a9508 100644 --- a/src/core/NEON/wrapper/intrinsics/svwhilelt.h +++ b/src/core/NEON/wrapper/intrinsics/svwhilelt.h @@ -32,7 +32,7 @@ namespace wrapper #define SVWHILELT_IMPL(type) \ template <size_t element_size> \ inline svbool_t svwhilelt_size(type a, type b); \ - \ + \ template <> \ inline svbool_t svwhilelt_size<64>(type a, type b) \ { \ @@ -70,4 +70,4 @@ inline svbool_t svwhilelt(IndexType a, IndexType b) } // namespace arm_compute #endif /* defined(__ARM_FEATURE_SVE) */ -#endif /* SRC_CORE_NEON_WRAPPER_INTRINSICS_SVWHILELT_H */
\ No newline at end of file +#endif /* SRC_CORE_NEON_WRAPPER_INTRINSICS_SVWHILELT_H */ diff --git a/src/core/NEON/wrapper/intrinsics/tanh.h b/src/core/NEON/wrapper/intrinsics/tanh.h index daeaf19997..e74f0e86fe 100644 --- a/src/core/NEON/wrapper/intrinsics/tanh.h +++ b/src/core/NEON/wrapper/intrinsics/tanh.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_WRAPPER_TANH_H #include "src/core/NEON/NEMath.h" + #include <arm_neon.h> namespace arm_compute |