Diffstat (limited to 'src/core')
36 files changed, 1007 insertions, 675 deletions
diff --git a/src/core/NEON/SVEMath.h b/src/core/NEON/SVEMath.h
index b73043a435..dde75e8088 100644
--- a/src/core/NEON/SVEMath.h
+++ b/src/core/NEON/SVEMath.h
@@ -24,7 +24,7 @@
 #ifndef ARM_COMPUTE_SVEMATH_H
 #define ARM_COMPUTE_SVEMATH_H
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ENABLE_SVE)
 #include "src/core/NEON/wrapper/intrinsics/svcvt.h"
 #include "src/core/NEON/wrapper/intrinsics/svdup_n.h"
 #include "src/core/NEON/wrapper/intrinsics/svreinterpret.h"
@@ -185,5 +185,5 @@ int_vec_type convert_float_to_int(const svfloat32_t &in_0, const svfloat32_t &in
 } // namespace arm_compute
 #include "src/core/NEON/SVEMath.inl"
-#endif /* defined(__ARM_FEATURE_SVE) */
+#endif /* defined(ENABLE_SVE) */
 #endif /* ARM_COMPUTE_SVEMATH_H */
\ No newline at end of file
diff --git a/src/core/NEON/SVEMath.inl b/src/core/NEON/SVEMath.inl
index d909adfeb5..7625e5be34 100644
--- a/src/core/NEON/SVEMath.inl
+++ b/src/core/NEON/SVEMath.inl
@@ -24,7 +24,7 @@
 #include <cmath>
 #include <limits>
-#if defined(__ARM_FEATURE_SVE)
+#if defined(__ARM_FEATURE_SVE) && defined(ENABLE_SVE)
 #ifndef M_PI
 #define M_PI (3.14159265358979323846)
@@ -388,4 +388,4 @@ inline svint8_t convert_float_to_int<svint8_t>(const svfloat32_t &in_0, const sv
 #endif /* defined(__ARM_FEATURE_SVE2) */
 } // namespace arm_compute
-#endif /* defined(__ARM_FEATURE_SVE) */
+#endif /* defined(ENABLE_SVE) */
diff --git a/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp b/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp
index 1691943b07..92000bb2f6 100644
--- a/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp
@@ -63,7 +63,7 @@ struct BatchNormalizationKernel
 static const BatchNormalizationKernel available_kernels[] =
 {
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ENABLE_SVE)
     {
         "fp16_sve_batch_normalization",
         [](const BatchNormalizationSelectorData & data) { return data.dt == DataType::F16; },
@@ -74,7 +74,8 @@ static const BatchNormalizationKernel available_kernels[] =
         [](const BatchNormalizationSelectorData & data) { return data.dt == DataType::F32; },
         REGISTER_FP32_SVE(arm_compute::cpu::fp32_sve_batch_normalization)
     },
-#else /* !defined(__ARM_FEATURE_SVE) */
+#endif /* !defined(ENABLE_SVE) */
+#if defined(ENABLE_NEON)
 #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
     {
         "fp16_neon_batch_normalization",
@@ -87,7 +88,7 @@ static const BatchNormalizationKernel available_kernels[] =
         [](const BatchNormalizationSelectorData & data) { return data.dt == DataType::F32; },
         REGISTER_FP32_NEON(arm_compute::cpu::fp32_neon_batch_normalization)
     },
-#endif /* !defined(__ARM_FEATURE_SVE) */
+#endif /* !defined(ENABLE_NEON) */
 };
 const BatchNormalizationKernel *get_implementation(const BatchNormalizationSelectorData &data)
diff --git a/src/core/NEON/kernels/batchnormalization/impl/SVE/fp16.cpp b/src/core/NEON/kernels/batchnormalization/impl/SVE/fp16.cpp
index 3e3e81d044..a715b9d3ee 100644
--- a/src/core/NEON/kernels/batchnormalization/impl/SVE/fp16.cpp
+++ b/src/core/NEON/kernels/batchnormalization/impl/SVE/fp16.cpp
@@ -29,7 +29,7 @@
 #include <cmath>
 #include <cstddef>
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ENABLE_SVE)
 #include <arm_sve.h>
 namespace arm_compute
@@ -114,4 +114,4 @@ void fp16_sve_batch_normalization(ITensor *src, ITensor *dst, const ITensor *mea
 }
 } // namespace cpu
 } // namespace arm_compute
-#endif // __ARM_FEATURE_SVE
+#endif // ENABLE_SVE
diff --git a/src/core/NEON/kernels/batchnormalization/impl/SVE/fp32.cpp b/src/core/NEON/kernels/batchnormalization/impl/SVE/fp32.cpp
index b0d4cbb684..7cc570d8aa 100644
--- a/src/core/NEON/kernels/batchnormalization/impl/SVE/fp32.cpp
+++ b/src/core/NEON/kernels/batchnormalization/impl/SVE/fp32.cpp
@@ -29,7 +29,7 @@
 #include <cmath>
 #include <cstddef>
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ENABLE_SVE)
 #include <arm_sve.h>
 namespace arm_compute
@@ -114,4 +114,4 @@ void fp32_sve_batch_normalization(ITensor *src, ITensor *dst, const ITensor *mea
 }
 } // namespace cpu
 } // namespace arm_compute
-#endif // __ARM_FEATURE_SVE
+#endif // ENABLE_SVE
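The batch-normalization change above shows the selection pattern this commit standardises: SVE and NEON entries now coexist in one available_kernels table instead of being compiled mutually exclusively, and get_implementation returns the first entry whose selector matches and whose registered function pointer is non-null. A minimal self-contained sketch of that pattern (types and names here are illustrative, not the library's):

    #include <cstdio>

    enum class DataType { F16, F32 };
    struct SelectorData { DataType dt; };
    using KernelFn = void (*)();

    struct KernelEntry
    {
        const char *name;
        bool (*is_selected)(const SelectorData &); // selector lambda in the real tables
        KernelFn ukernel;                          // nullptr when the backend was compiled out
    };

    void fp32_sve_kernel() {}

    static const KernelEntry available_kernels[] =
    {
        { "fp16_sve", [](const SelectorData &d) { return d.dt == DataType::F16; }, nullptr }, // e.g. FP16 disabled
        { "fp32_sve", [](const SelectorData &d) { return d.dt == DataType::F32; }, &fp32_sve_kernel },
    };

    const KernelEntry *get_implementation(const SelectorData &data)
    {
        for(const auto &uk : available_kernels)
        {
            // Skip entries whose backend resolved to nullptr at compile time
            if(uk.is_selected(data) && uk.ukernel != nullptr)
            {
                return &uk;
            }
        }
        return nullptr;
    }

    int main()
    {
        const auto *impl = get_implementation({ DataType::F32 });
        std::printf("selected: %s\n", impl != nullptr ? impl->name : "(none)");
        return 0;
    }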
diff --git a/src/core/NEON/wrapper/intrinsics/svpow.h b/src/core/NEON/wrapper/intrinsics/svpow.h
index e89a4ab8f6..0f58d758cb 100644
--- a/src/core/NEON/wrapper/intrinsics/svpow.h
+++ b/src/core/NEON/wrapper/intrinsics/svpow.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020 Arm Limited.
+ * Copyright (c) 2020-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -35,8 +35,16 @@ namespace wrapper
         return svpow_##postfix##_z(pg, a, b); \
     }
+#define SVPOW_Z_IMPL_INT(type, postfix)                            \
+    inline type svpow_z(svbool_t pg, const type &a, const type &b) \
+    {                                                              \
+        ARM_COMPUTE_UNUSED(pg, a, b);                              \
+        ARM_COMPUTE_ERROR("Not supported");                        \
+    }
+
 SVPOW_Z_IMPL(svfloat32_t, f32)
 SVPOW_Z_IMPL(svfloat16_t, f16)
+SVPOW_Z_IMPL_INT(svint16_t, s16)
 #undef SVPOW_Z_IMPL
diff --git a/src/core/NEON/wrapper/svtraits.h b/src/core/NEON/wrapper/svtraits.h
index 465983d16f..8d2d660659 100644
--- a/src/core/NEON/wrapper/svtraits.h
+++ b/src/core/NEON/wrapper/svtraits.h
@@ -23,7 +23,7 @@
  */
 #ifndef SRC_CORE_NEON_WRAPPER_SVTRAITS_H
 #define SRC_CORE_NEON_WRAPPER_SVTRAITS_H
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ENABLE_SVE)
 #include "src/core/NEON/SVEMath.h"
 #include <arm_sve.h>
@@ -66,5 +66,5 @@ DEFINE_TYPES(bfloat16_t)
 } // namespace wrapper
 } // namespace arm_compute
-#endif /* defined(__ARM_FEATURE_SVE) */
+#endif /* defined(ENABLE_SVE) */
 #endif /* #ifndef SRC_CORE_NEON_WRAPPER_SVTRAITS_H */
diff --git a/src/core/NEON/wrapper/traits.h b/src/core/NEON/wrapper/traits.h
index 3452b76761..81685140f1 100644
--- a/src/core/NEON/wrapper/traits.h
+++ b/src/core/NEON/wrapper/traits.h
@@ -26,9 +26,9 @@
 #include <arm_neon.h>
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ENABLE_SVE)
 #include <arm_sve.h>
-#endif /* defined(__ARM_FEATURE_SVE) */
+#endif /* defined(ENABLE_SVE) */
 namespace arm_compute
 {
@@ -116,13 +116,13 @@ template <> struct neon_bitvector<float16_t, BitWidth::W128>{ using type = float
 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ENABLE_SVE)
 /** Create the appropriate SVE vector given its type */
 template <typename T> struct sve_vector;
 template <> struct sve_vector<uint8_t>{ using scalar_type = uint8_t; using type = svuint8_t; };
 template <> struct sve_vector<int8_t>{ using scalar_type = int8_t; using type = svint8_t; };
-#endif /* defined(__ARM_FEATURE_SVE) */
+#endif /* defined(ENABLE_SVE) */
 #endif /* DOXYGEN_SKIP_THIS */
diff --git a/src/core/common/Registrars.h b/src/core/common/Registrars.h
index 112c83ad94..44ddf9808d 100644
--- a/src/core/common/Registrars.h
+++ b/src/core/common/Registrars.h
@@ -26,17 +26,17 @@
 #if defined(ENABLE_FP16_KERNELS)
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ENABLE_SVE)
 #define REGISTER_FP16_SVE(func_name) &(func_name)
-#else /* !defined(__ARM_FEATURE_SVE) */
+#else /* !defined(ENABLE_SVE) */
 #define REGISTER_FP16_SVE(func_name) nullptr
-#endif /* defined(__ARM_FEATURE_SVE) */
+#endif /* defined(ENABLE_SVE) */
-#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+#if defined(ENABLE_NEON) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
 #define REGISTER_FP16_NEON(func_name) &(func_name)
-#else /* !defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */
+#else /* !defined(ENABLE_NEON) */
 #define REGISTER_FP16_NEON(func_name) nullptr
-#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */
+#endif /* defined(ENABLE_NEON) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */
 #else /* !defined(ENABLE_FP16_KERNELS) */
 #define REGISTER_FP16_NEON(func_name) nullptr
@@ -44,50 +44,82 @@
 #endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
 #if defined(ENABLE_FP32_KERNELS)
-#if defined(__ARM_FEATURE_SVE)
+
+#if defined(ENABLE_SVE)
 #define REGISTER_FP32_SVE(func_name) &(func_name)
-#endif /* defined(__ARM_FEATURE_SVE) */
+#else /* !defined(ENABLE_SVE) */
+#define REGISTER_FP32_SVE(func_name) nullptr
+#endif /* defined(ENABLE_SVE) */
+
+#if defined(ENABLE_NEON)
 #define REGISTER_FP32_NEON(func_name) &(func_name)
+#else /* !defined(ENABLE_NEON) */
+#define REGISTER_FP32_NEON(func_name) nullptr
+#endif /* defined(ENABLE_NEON) */
+
 #else /* defined(ENABLE_FP32_KERNELS) */
 #define REGISTER_FP32_NEON(func_name) nullptr
 #define REGISTER_FP32_SVE(func_name) nullptr
 #endif /* defined(ENABLE_FP32_KERNELS) */
 #if defined(ENABLE_QASYMM8_SIGNED_KERNELS)
-#if defined(__ARM_FEATURE_SVE)
-#define REGISTER_QASYMM8_SIGNED_SVE(func_name) &(func_name)
-#endif /* defined(__ARM_FEATURE_SVE) */
+
 #define REGISTER_QASYMM8_SIGNED_NEON(func_name) &(func_name)
+
+#if defined(ENABLE_SVE)
+#define REGISTER_QASYMM8_SIGNED_SVE(func_name) &(func_name)
+#else /* !defined(ENABLE_SVE) */
+#define REGISTER_QASYMM8_SIGNED_SVE(func_name) nullptr
+#endif /* defined(ENABLE_SVE) */
+
 #else /* defined(ENABLE_QASYMM8_SIGNED_KERNELS) */
 #define REGISTER_QASYMM8_SIGNED_NEON(func_name) nullptr
 #define REGISTER_QASYMM8_SIGNED_SVE(func_name) nullptr
 #endif /* defined(ENABLE_QASYMM8_SIGNED_KERNELS) */
 #if defined(ENABLE_QASYMM8_KERNELS)
-#if defined(__ARM_FEATURE_SVE)
-#define REGISTER_QASYMM8_SVE(func_name) &(func_name)
-#endif /* defined(__ARM_FEATURE_SVE) */
 #define REGISTER_QASYMM8_NEON(func_name) &(func_name)
+
+#if defined(ENABLE_SVE)
+#define REGISTER_QASYMM8_SVE(func_name) &(func_name)
+#else /* !defined(ENABLE_SVE) */
+#define REGISTER_QASYMM8_SVE(func_name) nullptr
+#endif /* defined(ENABLE_SVE) */
+
 #else /* defined(ENABLE_QASYMM8_KERNELS) */
 #define REGISTER_QASYMM8_NEON(func_name) nullptr
 #define REGISTER_QASYMM8_SVE(func_name) nullptr
 #endif /* defined(ENABLE_QASYMM8_KERNELS) */
 #if defined(ENABLE_QSYMM16_KERNELS)
-#if defined(__ARM_FEATURE_SVE)
-#define REGISTER_QSYMM16_SVE(func_name) &(func_name)
-#endif /* defined(__ARM_FEATURE_SVE) */
+
 #define REGISTER_QSYMM16_NEON(func_name) &(func_name)
+
+#if defined(ENABLE_SVE)
+#define REGISTER_QSYMM16_SVE(func_name) &(func_name)
+#else /* !defined(ENABLE_SVE) */
+#define REGISTER_QSYMM16_SVE(func_name) nullptr
+#endif /* defined(ENABLE_SVE) */
+
 #else /* defined(ENABLE_QSYMM16_KERNELS) */
 #define REGISTER_QSYMM16_NEON(func_name) nullptr
 #define REGISTER_QSYMM16_SVE(func_name) nullptr
 #endif /* defined(ENABLE_QSYMM16_KERNELS) */
 #if defined(ENABLE_INTEGER_KERNELS)
-#if defined(__ARM_FEATURE_SVE)
+
+#if defined(ENABLE_SVE)
 #define REGISTER_INTEGER_SVE(func_name) &(func_name)
-#endif /* defined(__ARM_FEATURE_SVE) */
+#else /* !defined(ENABLE_SVE) */
+#define REGISTER_INTEGER_SVE(func_name) nullptr
+#endif /* defined(ENABLE_SVE) */
+
+#if defined(ENABLE_NEON)
 #define REGISTER_INTEGER_NEON(func_name) &(func_name)
+#else /* !defined(ENABLE_NEON) */
+#define REGISTER_INTEGER_NEON(func_name) nullptr
+#endif /* defined(ENABLE_NEON) */
+
 #else /* defined(ENABLE_INTEGER_KERNELS) */
 #define REGISTER_INTEGER_NEON(func_name) nullptr
 #define REGISTER_INTEGER_SVE(func_name) nullptr
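The net effect of the Registrars.h rework is that every REGISTER_*_SVE/NEON macro now expands to either &(func_name) or nullptr in every configuration, so a selector-table entry always compiles. A hedged sketch of why the disabled branch must swallow its argument (the kernel symbol is then never referenced, so its translation unit need not be built at all; names below are illustrative):

    #include <cstdio>

    // Mirrors the pattern in Registrars.h: resolve to a function pointer when
    // the backend is enabled, and to nullptr otherwise.
    #if defined(ENABLE_SVE)
    #define REGISTER_FP32_SVE(func_name) &(func_name)
    #else /* !defined(ENABLE_SVE) */
    #define REGISTER_FP32_SVE(func_name) nullptr
    #endif /* defined(ENABLE_SVE) */

    void fp32_sve_kernel(); // deliberately only declared: without ENABLE_SVE the
                            // macro discards the name, so no definition is needed

    using KernelFn = void (*)();
    static const KernelFn fp32_kernel = REGISTER_FP32_SVE(fp32_sve_kernel);

    int main()
    {
        std::printf("fp32 SVE kernel %s\n", fp32_kernel != nullptr ? "registered" : "compiled out");
        return 0;
    }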
diff --git a/src/core/cpu/kernels/CpuActivationKernel.cpp b/src/core/cpu/kernels/CpuActivationKernel.cpp
index eb38c18cff..8a57a3b529 100644
--- a/src/core/cpu/kernels/CpuActivationKernel.cpp
+++ b/src/core/cpu/kernels/CpuActivationKernel.cpp
@@ -60,7 +60,7 @@ struct ActivationKernel
 static const ActivationKernel available_kernels[] =
 {
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ENABLE_SVE)
     {
         "fp16_sve_activation",
         [](const ActivationSelectorData & data) { return data.dt == DataType::F16; },
@@ -71,7 +71,8 @@ static const ActivationKernel available_kernels[] =
         [](const ActivationSelectorData & data) { return data.dt == DataType::F32; },
         REGISTER_FP32_SVE(arm_compute::cpu::fp32_sve_activation)
     },
-#else /* !defined(__ARM_FEATURE_SVE) */
+#endif /* defined(ENABLE_SVE) */
+#if defined(ENABLE_NEON)
     {
         "fp16_neon_activation",
         [](const ActivationSelectorData & data) { return data.dt == DataType::F16; },
@@ -82,9 +83,8 @@ static const ActivationKernel available_kernels[] =
         [](const ActivationSelectorData & data) { return data.dt == DataType::F32; },
         REGISTER_FP32_NEON(arm_compute::cpu::fp32_neon_activation)
     },
-#endif /* defined(__ARM_FEATURE_SVE) */
-
-#if defined(__ARM_FEATURE_SVE2) /* defined(__ARM_FEATURE_SVE2) */
+#endif /* defined(ENABLE_NEON) */
+#if defined(__ARM_FEATURE_SVE2)
     {
         "qasymm8_sve_activation",
         [](const ActivationSelectorData & data) { return data.dt == DataType::QASYMM8; },
@@ -116,7 +116,7 @@ static const ActivationKernel available_kernels[] =
         [](const ActivationSelectorData & data) { return data.dt == DataType::QSYMM16; },
         REGISTER_QSYMM16_NEON(arm_compute::cpu::qsymm16_neon_activation)
     },
-#endif /* defined(__ARM_FEATURE_SVE2) */
+#endif /* defined(__ARM_FEATURE_SVE2) */
 };
 const ActivationKernel *get_implementation(const ActivationSelectorData &data)
diff --git a/src/core/cpu/kernels/CpuAddKernel.cpp b/src/core/cpu/kernels/CpuAddKernel.cpp
index fc88a7e22d..7afdceae38 100644
--- a/src/core/cpu/kernels/CpuAddKernel.cpp
+++ b/src/core/cpu/kernels/CpuAddKernel.cpp
@@ -61,7 +61,7 @@ struct AddKernel
 static const AddKernel available_kernels[] =
 {
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ENABLE_SVE)
     {
         "add_same_sve",
         [](const AddSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == DataType::F32)); },
@@ -102,7 +102,8 @@ static const AddKernel available_kernels[] =
         [](const AddSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt3 == DataType::S16)); },
         REGISTER_INTEGER_SVE(arm_compute::cpu::add_u8_u8_s16_sve)
     },
-#else /* !defined(__ARM_FEATURE_SVE) */
+#endif /* defined(ENABLE_SVE) */
+#if defined(ENABLE_NEON)
     {
         "add_same_neon",
         [](const AddSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == DataType::F32)); },
@@ -145,8 +146,7 @@ static const AddKernel available_kernels[] =
         [](const AddSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt3 == DataType::S16)); },
         REGISTER_INTEGER_NEON(arm_compute::cpu::add_u8_u8_s16_neon)
     },
-#endif /* defined(__ARM_FEATURE_SVE) */
-
+#endif /* defined(ENABLE_NEON) */
 #if defined(__ARM_FEATURE_SVE2)
     {
         "add_qasymm8_sve",
@@ -179,7 +179,7 @@ static const AddKernel available_kernels[] =
         [](const AddSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == DataType::QSYMM16)); },
         REGISTER_QSYMM16_NEON(arm_compute::cpu::add_qsymm16_neon)
     },
-#endif /* defined(__ARM_FEATURE_SVE2) */
+#endif /* defined(ENABLE_NEON) */
 };
diff --git a/src/core/cpu/kernels/CpuElementwiseKernel.cpp b/src/core/cpu/kernels/CpuElementwiseKernel.cpp
index ddbc48feb8..643a870540 100644
--- a/src/core/cpu/kernels/CpuElementwiseKernel.cpp
+++ b/src/core/cpu/kernels/CpuElementwiseKernel.cpp
@@ -76,28 +76,31 @@ configure_arithm_func(const ITensorInfo *src0, const ITensorInfo *src1, ITensorI
     ARM_COMPUTE_UNUSED(src1, dst);
     static ElementwiseKernel kernels[] =
     {
-#if defined(__ARM_FEATURE_SVE)
-        generate_kernel<DataType::F32>(REGISTER_FP32_SVE((arm_compute::cpu::sve::elementwise_arithmetic_op<op, float32_t>))),
-        generate_kernel<DataType::S32>(REGISTER_INTEGER_SVE((arm_compute::cpu::sve::elementwise_arithmetic_op<op, int32_t>))),
-#else /* defined(__ARM_FEATURE_SVE) */
+#if defined(ENABLE_SVE)
+        generate_kernel<DataType::F32>(REGISTER_FP32_SVE((arm_compute::cpu::elementwise_arithmetic_op<op, float32_t>))),
+        generate_kernel<DataType::S32>(REGISTER_INTEGER_SVE((arm_compute::cpu::elementwise_arithmetic_op<op, int32_t>))),
+        generate_kernel<DataType::S16>(REGISTER_INTEGER_SVE((arm_compute::cpu::elementwise_arithmetic_op<op, int16_t>))),
+#endif /* defined(ENABLE_SVE) */
+#if defined(ENABLE_NEON)
         generate_kernel<DataType::F32>(REGISTER_FP32_NEON((arm_compute::cpu::elementwise_arithm_op<op, typename wrapper::traits::neon_vector<float, 4>>))),
         generate_kernel<DataType::S32>(REGISTER_INTEGER_NEON((arm_compute::cpu::elementwise_arithm_op<op, typename wrapper::traits::neon_vector<int32_t, 4>>))),
-#endif /* defined(__ARM_FEATURE_SVE) */
+#endif /* defined(ENABLE_NEON) */
 #if defined(__ARM_FEATURE_SVE2)
-        generate_kernel<DataType::QASYMM8>(REGISTER_QASYMM8_SVE((arm_compute::cpu::sve::elementwise_arithmetic_quantized_op<op, uint8_t>))),
-        generate_kernel<DataType::QASYMM8_SIGNED>(REGISTER_QASYMM8_SIGNED_SVE((arm_compute::cpu::sve::elementwise_arithmetic_quantized_op<op, int8_t>))),
-#else /* defined(__ARM_FEATURE_SVE2) */
+        generate_kernel<DataType::QASYMM8>(REGISTER_QASYMM8_SVE((arm_compute::cpu::elementwise_arithmetic_quantized_op<op, uint8_t>))),
+        generate_kernel<DataType::QASYMM8_SIGNED>(REGISTER_QASYMM8_SIGNED_SVE((arm_compute::cpu::elementwise_arithmetic_quantized_op<op, int8_t>))),
+#else /* !defined(__ARM_FEATURE_SVE2) */
         generate_kernel<DataType::QASYMM8>(REGISTER_QASYMM8_NEON((arm_compute::cpu::elementwise_arithm_op_quantized<op>))),
         generate_kernel<DataType::QASYMM8_SIGNED>(REGISTER_QASYMM8_SIGNED_NEON((arm_compute::cpu::elementwise_arithm_op_quantized_signed<op>))),
 #endif /* defined(__ARM_FEATURE_SVE2) */
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-#if defined(__ARM_FEATURE_SVE)
-        generate_kernel<DataType::F16>(REGISTER_FP16_SVE((arm_compute::cpu::sve::elementwise_arithmetic_op<op, float16_t>))),
-#else /* defined(__ARM_FEATURE_SVE) */
+#if defined(ENABLE_SVE)
+        generate_kernel<DataType::F16>(REGISTER_FP16_SVE((arm_compute::cpu::elementwise_arithmetic_op<op, float16_t>))),
+#endif /* defined(ENABLE_SVE) */
+#if defined(ENABLE_NEON)
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
         generate_kernel<DataType::F16>(REGISTER_FP16_NEON((arm_compute::cpu::elementwise_arithm_op<op, typename wrapper::traits::neon_vector<float16_t, 8>>))),
-#endif /* defined(__ARM_FEATURE_SVE) */
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */
         generate_kernel<DataType::S16>(REGISTER_INTEGER_NEON((arm_compute::cpu::elementwise_arithm_op<op, typename wrapper::traits::neon_vector<int16_t, 8>>))),
+#endif /* defined(ENABLE_NEON) */
     };
     for(const auto &uk : kernels)
@@ -118,31 +121,31 @@ configure_comp_func(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInf
     ARM_COMPUTE_UNUSED(src1, dst);
     static ElementwiseKernel kernels[] =
     {
-#if defined(__ARM_FEATURE_SVE)
-        generate_kernel<DataType::U8, DataType::U8>(REGISTER_INTEGER_SVE((arm_compute::cpu::sve::elementwise_comparison_op<op, uint8_t>))),
-        generate_kernel<DataType::F32, DataType::U8>(REGISTER_FP32_SVE((arm_compute::cpu::sve::elementwise_comparison_op<op, float>))),
-        generate_kernel<DataType::S16, DataType::U8>(REGISTER_INTEGER_SVE((arm_compute::cpu::sve::elementwise_comparison_op<op, int16_t>))),
-        generate_kernel<DataType::S32, DataType::U8>(REGISTER_INTEGER_SVE((arm_compute::cpu::sve::elementwise_comparison_op<op, int32_t>))),
-#else /* defined(__ARM_FEATURE_SVE) */
+#if defined(ENABLE_SVE)
+        generate_kernel<DataType::U8, DataType::U8>(REGISTER_INTEGER_SVE((arm_compute::cpu::elementwise_comparison_op<op, uint8_t>))),
+        generate_kernel<DataType::F32, DataType::U8>(REGISTER_FP32_SVE((arm_compute::cpu::elementwise_comparison_op<op, float>))),
+        generate_kernel<DataType::S16, DataType::U8>(REGISTER_INTEGER_SVE((arm_compute::cpu::elementwise_comparison_op<op, int16_t>))),
+        generate_kernel<DataType::S32, DataType::U8>(REGISTER_INTEGER_SVE((arm_compute::cpu::elementwise_comparison_op<op, int32_t>))),
+#endif /* defined(ENABLE_SVE) */
+#if defined(ENABLE_NEON)
         generate_kernel<DataType::U8, DataType::U8>(REGISTER_INTEGER_NEON((arm_compute::cpu::elementwise_comp_op_8<op, uint8_t, uint8x16_t>))),
         generate_kernel<DataType::F32, DataType::U8>(REGISTER_FP32_NEON((arm_compute::cpu::elementwise_comp_op_32<op, float, float32x4_t>))),
         generate_kernel<DataType::S16, DataType::U8>(REGISTER_INTEGER_NEON((arm_compute::cpu::elementwise_comp_op_16<op, int16_t, int16x8_t>))),
         generate_kernel<DataType::S32, DataType::U8>(REGISTER_INTEGER_NEON((arm_compute::cpu::elementwise_comp_op_32<op, int32_t, int32x4_t>))),
-#endif /* defined(__ARM_FEATURE_SVE) */
+#endif /* defined(ENABLE_NEON) */
 #if defined(__ARM_FEATURE_SVE2)
-        generate_kernel<DataType::QASYMM8_SIGNED, DataType::U8>(REGISTER_QASYMM8_SIGNED_SVE((arm_compute::cpu::sve::elementwise_comparison_quantized_op<op, int8_t>))),
-        generate_kernel<DataType::QASYMM8, DataType::U8>(REGISTER_QASYMM8_SVE((arm_compute::cpu::sve::elementwise_comparison_quantized_op<op, uint8_t>))),
-#else /* defined(__ARM_FEATURE_SVE2) */
+        generate_kernel<DataType::QASYMM8_SIGNED, DataType::U8>(REGISTER_QASYMM8_SIGNED_SVE((arm_compute::cpu::elementwise_comparison_quantized_op<op, int8_t>))),
+        generate_kernel<DataType::QASYMM8, DataType::U8>(REGISTER_QASYMM8_SVE((arm_compute::cpu::elementwise_comparison_quantized_op<op, uint8_t>))),
+#else /* !defined(__ARM_FEATURE_SVE2) */
         generate_kernel<DataType::QASYMM8_SIGNED, DataType::U8>(REGISTER_QASYMM8_SIGNED_NEON((arm_compute::cpu::elementwise_comp_op_quantized_signed<op>))),
         generate_kernel<DataType::QASYMM8, DataType::U8>(REGISTER_QASYMM8_NEON((arm_compute::cpu::elementwise_comp_op_quantized<op>))),
 #endif /* defined(__ARM_FEATURE_SVE2) */
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-#if defined(__ARM_FEATURE_SVE)
-        generate_kernel<DataType::F16, DataType::U8>(REGISTER_FP16_SVE((arm_compute::cpu::sve::elementwise_comparison_op<op, float16_t>))),
-#else /* defined(__ARM_FEATURE_SVE) */
+#if defined(ENABLE_SVE)
+        generate_kernel<DataType::F16, DataType::U8>(REGISTER_FP16_SVE((arm_compute::cpu::elementwise_comparison_op<op, float16_t>))),
+#endif /* defined(ENABLE_SVE) */
+#if defined(ENABLE_NEON) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
         generate_kernel<DataType::F16, DataType::U8>(REGISTER_FP16_NEON((arm_compute::cpu::elementwise_comp_op_16<op, float16_t, float16x8_t>))),
-#endif /* defined(__ARM_FEATURE_SVE) */
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+#endif /* defined(ENABLE_NEON) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */
     };
     for(const auto &uk : kernels)
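A small but easy-to-miss detail in the elementwise tables above: the registered functions are wrapped in double parentheses, e.g. REGISTER_FP32_SVE((elementwise_arithmetic_op<op, float32_t>)). The inner parentheses keep the comma inside the template argument list from being parsed as a macro argument separator. A compact, self-contained illustration (the REGISTER macro here is a stand-in, not the library's):

    #include <cstdio>

    #define REGISTER(func_name) &(func_name)

    template <int Op, typename T>
    void kernel(T) {}

    int main()
    {
        // REGISTER(kernel<0, float>) would fail: the preprocessor would see two
        // macro arguments, "kernel<0" and "float>". The extra parentheses make
        // the whole template-id a single argument.
        void (*fn)(float) = REGISTER((kernel<0, float>));
        std::printf("%p\n", reinterpret_cast<void *>(fn));
        return 0;
    }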
diff --git a/src/core/cpu/kernels/CpuElementwiseUnaryKernel.cpp b/src/core/cpu/kernels/CpuElementwiseUnaryKernel.cpp
index 3a96d93c03..2600a49b70 100644
--- a/src/core/cpu/kernels/CpuElementwiseUnaryKernel.cpp
+++ b/src/core/cpu/kernels/CpuElementwiseUnaryKernel.cpp
@@ -54,7 +54,7 @@ struct ElementwiseUnaryKernel
 static const ElementwiseUnaryKernel available_kernels[] =
 {
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ENABLE_SVE)
     {
         "fp32_sve_elementwise_unary",
         [](DataType dt) { return dt == DataType::F32; },
@@ -70,7 +70,8 @@ static const ElementwiseUnaryKernel available_kernels[] =
         [](DataType dt) { return dt == DataType::S32; },
         REGISTER_INTEGER_SVE(arm_compute::cpu::elementwise_sve_op<int32_t>),
     },
-#endif // defined(__ARM_FEATURE_SVE)
+#endif // defined(ENABLE_SVE)
+#if defined(ENABLE_NEON)
     {
         "fp32_neon_elementwise_unary",
         [](DataType dt) { return dt == DataType::F32; },
@@ -88,6 +89,7 @@ static const ElementwiseUnaryKernel available_kernels[] =
         [](DataType dt) { return dt == DataType::S32; },
         REGISTER_INTEGER_NEON(arm_compute::cpu::elementwise_op<int32_t>),
     },
+#endif // defined(ENABLE_NEON)
 };
 const ElementwiseUnaryKernel *get_implementation(DataType dt)
diff --git a/src/core/cpu/kernels/CpuScaleKernel.cpp b/src/core/cpu/kernels/CpuScaleKernel.cpp
index ed7517111f..29475fa63f 100644
--- a/src/core/cpu/kernels/CpuScaleKernel.cpp
+++ b/src/core/cpu/kernels/CpuScaleKernel.cpp
@@ -64,38 +64,39 @@ struct ScaleKernel
 static const ScaleKernel available_kernels[] =
 {
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ENABLE_SVE)
     {
         "fp16_sve_scale",
         [](const ScaleSelectorData & data) { return data.dt == DataType::F16; },
-        REGISTER_FP16_NEON(arm_compute::cpu::fp16_sve_scale)
+        REGISTER_FP16_SVE(arm_compute::cpu::fp16_sve_scale)
     },
     {
         "f32_sve_scale",
         [](const ScaleSelectorData & data) { return data.dt == DataType::F32; },
-        REGISTER_FP32_NEON(arm_compute::cpu::fp32_sve_scale)
+        REGISTER_FP32_SVE(arm_compute::cpu::fp32_sve_scale)
     },
     {
         "qasymm8_sve_scale",
         [](const ScaleSelectorData & data) { return data.dt == DataType::QASYMM8; },
-        REGISTER_QASYMM8_NEON(arm_compute::cpu::qasymm8_sve_scale)
+        REGISTER_QASYMM8_SVE(arm_compute::cpu::qasymm8_sve_scale)
    },
     {
         "qasymm8_signed_sve_scale",
         [](const ScaleSelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED; },
-        REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::qasymm8_signed_sve_scale)
+        REGISTER_QASYMM8_SIGNED_SVE(arm_compute::cpu::qasymm8_signed_sve_scale)
     },
     {
         "u8_sve_scale",
         [](const ScaleSelectorData & data) { return data.dt == DataType::U8; },
-        REGISTER_INTEGER_NEON(arm_compute::cpu::u8_sve_scale)
+        REGISTER_INTEGER_SVE(arm_compute::cpu::u8_sve_scale)
     },
     {
         "s16_sve_scale",
         [](const ScaleSelectorData & data) { return data.dt == DataType::S16; },
-        REGISTER_INTEGER_NEON(arm_compute::cpu::s16_sve_scale)
+        REGISTER_INTEGER_SVE(arm_compute::cpu::s16_sve_scale)
     },
-#else /* !defined(__ARM_FEATURE_SVE) */
+#endif /* defined(ENABLE_SVE) */
+#if defined(ENABLE_NEON)
 #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
     {
         "common_neon_scale",
@@ -128,7 +129,7 @@ static const ScaleKernel available_kernels[] =
         [](const ScaleSelectorData & data) { return data.dt == DataType::S16; },
         REGISTER_INTEGER_NEON(arm_compute::cpu::common_neon_scale<int16_t>)
     },
-#endif /* !defined(__ARM_FEATURE_SVE) */
+#endif /* defined(ENABLE_NEON) */
 };
 /** Micro-kernel selector
diff --git a/src/core/cpu/kernels/CpuSoftmaxKernel.cpp b/src/core/cpu/kernels/CpuSoftmaxKernel.cpp
index d2453ed21d..8ea186b16a 100644
--- a/src/core/cpu/kernels/CpuSoftmaxKernel.cpp
+++ b/src/core/cpu/kernels/CpuSoftmaxKernel.cpp
@@ -34,8 +34,8 @@
 #include "src/core/helpers/WindowHelpers.h"
 #include "src/core/common/Registrars.h"
-#include "src/core/cpu/kernels/softmax/impl/NEON/list.h"
-#include "src/core/cpu/kernels/softmax/impl/SVE/list.h"
+#include "src/core/cpu/kernels/softmax/impl/neon/list.h"
+#include "src/core/cpu/kernels/softmax/impl/sve/list.h"
 namespace arm_compute
 {
@@ -69,7 +69,7 @@ struct SoftmaxLogits1DMaxKernel
 static const SoftmaxLogits1DKernel available_logits_1d_kernels[] =
 {
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ENABLE_SVE)
     {
         "sve_softmax_logits_1d_float",
         [](const SoftmaxSelectorData & data) { return (data.dt == DataType::F32); },
@@ -80,7 +80,9 @@ static const SoftmaxLogits1DKernel available_logits_1d_kernels[] =
         [](const SoftmaxSelectorData & data) { return (data.dt == DataType::F16); },
         REGISTER_FP16_SVE(arm_compute::cpu::sve_softmax_logits_1d_float<float16_t>)
     },
-#else /* !defined(__ARM_FEATURE_SVE) */
+#endif /* defined(ENABLE_SVE) */
+
+#if defined(ENABLE_NEON)
     {
         "neon_softmax_logits_1d_float",
         [](const SoftmaxSelectorData & data) { return (data.dt == DataType::F32); },
@@ -93,7 +95,7 @@ static const SoftmaxLogits1DKernel available_logits_1d_kernels[] =
         REGISTER_FP16_NEON(arm_compute::cpu::neon_softmax_logits_1d_float<float16_t>)
     },
 #endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */
-#endif /* defined(__ARM_FEATURE_SVE) */
+#endif /* !defined(ENABLE_NEON) */
 #if defined(__ARM_FEATURE_SVE2)
     {
@@ -123,7 +125,7 @@ static const SoftmaxLogits1DKernel available_logits_1d_kernels[] =
 static const SoftmaxLogits1DMaxKernel available_logits_1d_max_kernels[] =
 {
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ENABLE_SVE)
     {
         "sve_logits_1d_max",
         [](const SoftmaxSelectorData & data) { return (data.dt == DataType::F32); },
@@ -144,7 +146,8 @@ static const SoftmaxLogits1DMaxKernel available_logits_1d_max_kernels[] =
         [](const SoftmaxSelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED); },
         REGISTER_QASYMM8_SIGNED_SVE(arm_compute::cpu::sve_logits_1d_max<qasymm8_signed_t>)
     },
-#else /* !defined(__ARM_FEATURE_SVE) */
+#endif /* defined(ENABLE_SVE) */
+#if defined(ENABLE_NEON)
     {
         "neon_logits_1d_max",
         [](const SoftmaxSelectorData & data) { return (data.dt == DataType::F32); },
@@ -167,7 +170,7 @@ static const SoftmaxLogits1DMaxKernel available_logits_1d_max_kernels[] =
         [](const SoftmaxSelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED); },
         REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_logits_1d_max<qasymm8_signed_t>)
     },
-#endif /* defined(__ARM_FEATURE_SVE) */
+#endif /* defined(ENABLE_NEON) */
 };
 const SoftmaxLogits1DKernel *get_implementation_logits(const SoftmaxSelectorData &data)
diff --git a/src/core/cpu/kernels/activation/sve/fp16.cpp b/src/core/cpu/kernels/activation/sve/fp16.cpp
index bf31fd7d93..e4be1a4faa 100644
--- a/src/core/cpu/kernels/activation/sve/fp16.cpp
+++ b/src/core/cpu/kernels/activation/sve/fp16.cpp
@@ -28,7 +28,6 @@
 #include <cmath>
 #include <cstddef>
-#if defined(__ARM_FEATURE_SVE)
 #include "src/core/NEON/SVEMath.h"
 #include <arm_sve.h>
@@ -126,5 +125,4 @@ void fp16_sve_activation(const ITensor *src, ITensor *dst, const ActivationLayer
                          input, output);
 }
 } // namespace cpu
-} // namespace arm_compute
-#endif // __ARM_FEATURE_SVE
\ No newline at end of file
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/core/cpu/kernels/activation/sve/fp32.cpp b/src/core/cpu/kernels/activation/sve/fp32.cpp
index 75f9f8a4c3..f797944435 100644
--- a/src/core/cpu/kernels/activation/sve/fp32.cpp
+++ b/src/core/cpu/kernels/activation/sve/fp32.cpp
@@ -29,7 +29,6 @@
 #include <cmath>
 #include <cstddef>
-#if defined(__ARM_FEATURE_SVE)
 #include <arm_sve.h>
 namespace arm_compute
@@ -127,5 +126,4 @@ void fp32_sve_activation(const ITensor *src, ITensor *dst, const ActivationLayer
                          input, output);
 }
 } // namespace cpu
-} // namespace arm_compute
-#endif // __ARM_FEATURE_SVE
\ No newline at end of file
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/core/cpu/kernels/add/sve/impl.cpp b/src/core/cpu/kernels/add/sve/impl.cpp
new file mode 100644
index 0000000000..d1660fe19e
--- /dev/null
+++ b/src/core/cpu/kernels/add/sve/impl.cpp
@@ -0,0 +1,137 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/Traits.h"
+#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
+
+#include "src/core/NEON/SVEMath.h"
+#include "src/core/cpu/kernels/add/sve/impl.h"
+#include <arm_sve.h>
+
+namespace arm_compute
+{
+namespace cpu
+{
+template <typename ScalarType>
+void add_same_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+{
+    const auto all_true_pg = wrapper::svptrue<ScalarType>();
+    const auto window_start_x = static_cast<int>(window.x().start());
+    const auto window_end_x = static_cast<int>(window.x().end());
+    const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x();
+    const bool is_sat = (policy == ConvertPolicy::SATURATE);
+
+    // Clear X Dimension on execution window as we handle manually
+    Window win = window;
+    win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+    // Create input windows
+    Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape());
+    Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
+
+    Iterator input1(src0, window.broadcast_if_dimension_le_one(src0->info()->tensor_shape()));
+    Iterator input2(src1, window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()));
+    Iterator output(dst, window);
+
+    if(is_broadcast_across_x)
+    {
+        const bool is_broadcast_input_2 = input2_win.x().step() == 0;
+        Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
+        Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
+        const ITensor *broadcast_tensor = is_broadcast_input_2 ? src1 : src0;
+        const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src1 : src0;
+
+        // Clear X Dimension on execution window as we handle manually
+        non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+        Iterator broadcast_input(broadcast_tensor, broadcast_win);
+        Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
+        Iterator output(dst, win);
+
+        execute_window_loop(win, [&](const Coordinates &)
+        {
+            const auto non_broadcast_input_ptr = reinterpret_cast<const ScalarType *>(non_broadcast_input.ptr());
+            const auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr());
+
+            const ScalarType broadcast_value = *reinterpret_cast<const ScalarType *>(broadcast_input.ptr());
+            const auto broadcast_value_vec = wrapper::svdup_n(broadcast_value);
+
+            int x = window_start_x;
+            svbool_t pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
+            do
+            {
+                const auto non_broadcast_v = svld1(pg, non_broadcast_input_ptr + x);
+                auto res = is_sat ? wrapper::svqadd(broadcast_value_vec, non_broadcast_v) : svadd_z(pg, broadcast_value_vec, non_broadcast_v);
+                svst1(pg, output_ptr + x, res);
+
+                x += wrapper::svcnt<ScalarType>();
+                pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
+            }
+            while(svptest_any(all_true_pg, pg));
+        },
+        broadcast_input, non_broadcast_input, output);
+    }
+    else
+    {
+        // Clear X Dimension on execution window as we handle manually
+        input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+        input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+        Iterator input1(src0, input1_win);
+        Iterator input2(src1, input2_win);
+        Iterator output(dst, win);
+
+        execute_window_loop(win, [&](const Coordinates &)
+        {
+            const auto input1_ptr = reinterpret_cast<const ScalarType *>(input1.ptr());
+            const auto input2_ptr = reinterpret_cast<const ScalarType *>(input2.ptr());
+            const auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr());
+
+            int x = window_start_x;
+            svbool_t pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
+            do
+            {
+                const auto val1 = svld1(pg, input1_ptr + x);
+                const auto val2 = svld1(pg, input2_ptr + x);
+                const auto res = is_sat ? wrapper::svqadd(val1, val2) : svadd_z(pg, val1, val2);
+                svst1(pg, output_ptr + x, res);
+
+                x += wrapper::svcnt<ScalarType>();
+                pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
+            }
+            while(svptest_any(all_true_pg, pg));
+        },
+        input1, input2, output);
+    }
+}
+
+template void add_same_sve<float>(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
+template void add_same_sve<float16_t>(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
+template void add_same_sve<uint8_t>(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
+template void add_same_sve<int16_t>(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
+template void add_same_sve<int32_t>(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
+} // namespace cpu
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/core/cpu/kernels/add/sve/impl.h b/src/core/cpu/kernels/add/sve/impl.h
new file mode 100644
index 0000000000..c38b1d47e0
--- /dev/null
+++ b/src/core/cpu/kernels/add/sve/impl.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_CORE_SVE_KERNELS_ADD_IMPL_H
+#define SRC_CORE_SVE_KERNELS_ADD_IMPL_H
+
+#if defined(ENABLE_SVE)
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/Traits.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+template <typename ScalarType>
+void add_same_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
+} // namespace cpu
+} // namespace arm_compute
+#endif // defined(ENABLE_SVE)
+#endif // SRC_CORE_SVE_KERNELS_ADD_IMPL_H
\ No newline at end of file
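add_same_sve above is built around the canonical SVE tail-handling idiom: a whilelt predicate governs loads, computation, and stores, the index advances by the hardware vector length, and the loop exits once no lane is active, so no scalar epilogue is needed. A stripped-down version of the same loop, without the library's Window/Iterator machinery (requires an SVE-enabled compiler, e.g. -march=armv8.2-a+sve; illustrative, not the library code):

    #include <arm_sve.h>
    #include <cstdint>

    // Predicated SVE loop in the same shape as add_same_sve: the predicate
    // simply switches off the out-of-range lanes on the final pass.
    void add_f32(const float *a, const float *b, float *out, int64_t n)
    {
        int64_t x = 0;
        svbool_t pg = svwhilelt_b32_s64(x, n); // lanes [x, n) are active
        do
        {
            const svfloat32_t va = svld1_f32(pg, a + x); // predicated loads
            const svfloat32_t vb = svld1_f32(pg, b + x);
            svst1_f32(pg, out + x, svadd_f32_z(pg, va, vb)); // store active lanes only
            x += svcntw();                     // vector length in 32-bit words
            pg = svwhilelt_b32_s64(x, n);
        }
        while(svptest_any(svptrue_b32(), pg)); // continue while any lane is active
    }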
diff --git a/src/core/cpu/kernels/add/sve/integer.cpp b/src/core/cpu/kernels/add/sve/integer.cpp
index ae74bfa3eb..6dec140499 100644
--- a/src/core/cpu/kernels/add/sve/integer.cpp
+++ b/src/core/cpu/kernels/add/sve/integer.cpp
@@ -25,9 +25,8 @@
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/utils/misc/Traits.h"
-#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
-#if defined(__ARM_FEATURE_SVE)
 #include "src/core/NEON/SVEMath.h"
+#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
 #include <arm_sve.h>
 namespace arm_compute
@@ -197,5 +196,4 @@ void add_u8_s16_s16_sve(const ITensor *src0, const ITensor *src1, ITensor *dst,
     add_s16_u8_s16_sve(src1, src0, dst, policy, window);
 }
 } // namespace cpu
-} // namespace arm_compute
-#endif /* defined(__ARM_FEATURE_SVE) */
\ No newline at end of file
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/core/cpu/kernels/add/sve/list.h b/src/core/cpu/kernels/add/sve/list.h
index 71dd875ad8..aebb43bb60 100644
--- a/src/core/cpu/kernels/add/sve/list.h
+++ b/src/core/cpu/kernels/add/sve/list.h
@@ -24,11 +24,12 @@
 #ifndef SRC_CORE_SVE_KERNELS_ADD_LIST_H
 #define SRC_CORE_SVE_KERNELS_ADD_LIST_H
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ENABLE_SVE)
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/utils/misc/Traits.h"
 #include "src/core/NEON/SVEMath.h"
 #include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
+#include "src/core/cpu/kernels/add/sve/impl.h"
 #include <arm_sve.h>
 namespace arm_compute
@@ -47,99 +48,7 @@ DECLARE_ADD_KERNEL(add_u8_u8_s16_sve);
 #undef DECLARE_ADD_KERNEL
-template <typename ScalarType>
-void add_same_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
-{
-    const auto all_true_pg = wrapper::svptrue<ScalarType>();
-    const auto window_start_x = static_cast<int>(window.x().start());
-    const auto window_end_x = static_cast<int>(window.x().end());
-    const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x();
-    const bool is_sat = (policy == ConvertPolicy::SATURATE);
-
-    // Clear X Dimension on execution window as we handle manually
-    Window win = window;
-    win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
-    // Create input windows
-    Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape());
-    Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
-
-    Iterator input1(src0, window.broadcast_if_dimension_le_one(src0->info()->tensor_shape()));
-    Iterator input2(src1, window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()));
-    Iterator output(dst, window);
-
-    if(is_broadcast_across_x)
-    {
-        const bool is_broadcast_input_2 = input2_win.x().step() == 0;
-        Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
-        Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
-        const ITensor *broadcast_tensor = is_broadcast_input_2 ? src1 : src0;
-        const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src1 : src0;
-
-        // Clear X Dimension on execution window as we handle manually
-        non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
-        Iterator broadcast_input(broadcast_tensor, broadcast_win);
-        Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
-        Iterator output(dst, win);
-
-        execute_window_loop(win, [&](const Coordinates &)
-        {
-            const auto non_broadcast_input_ptr = reinterpret_cast<const ScalarType *>(non_broadcast_input.ptr());
-            const auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr());
-
-            const ScalarType broadcast_value = *reinterpret_cast<const ScalarType *>(broadcast_input.ptr());
-            const auto broadcast_value_vec = wrapper::svdup_n(broadcast_value);
-
-            int x = window_start_x;
-            svbool_t pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
-            do
-            {
-                const auto non_broadcast_v = svld1(pg, non_broadcast_input_ptr + x);
-                auto res = is_sat ? wrapper::svqadd(broadcast_value_vec, non_broadcast_v) : svadd_z(pg, broadcast_value_vec, non_broadcast_v);
-                svst1(pg, output_ptr + x, res);
-
-                x += wrapper::svcnt<ScalarType>();
-                pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
-            }
-            while(svptest_any(all_true_pg, pg));
-        },
-        broadcast_input, non_broadcast_input, output);
-    }
-    else
-    {
-        // Clear X Dimension on execution window as we handle manually
-        input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-        input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
-        Iterator input1(src0, input1_win);
-        Iterator input2(src1, input2_win);
-        Iterator output(dst, win);
-
-        execute_window_loop(win, [&](const Coordinates &)
-        {
-            const auto input1_ptr = reinterpret_cast<const ScalarType *>(input1.ptr());
-            const auto input2_ptr = reinterpret_cast<const ScalarType *>(input2.ptr());
-            const auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr());
-
-            int x = window_start_x;
-            svbool_t pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
-            do
-            {
-                const auto val1 = svld1(pg, input1_ptr + x);
-                const auto val2 = svld1(pg, input2_ptr + x);
-                const auto res = is_sat ? wrapper::svqadd(val1, val2) : svadd_z(pg, val1, val2);
-                svst1(pg, output_ptr + x, res);
-
-                x += wrapper::svcnt<ScalarType>();
-                pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
-            }
-            while(svptest_any(all_true_pg, pg));
-        },
-        input1, input2, output);
-    }
-}
 } // namespace cpu
 } // namespace arm_compute
-#endif // defined(__ARM_FEATURE_SVE)
+#endif // defined(ENABLE_SVE)
 #endif // SRC_CORE_SVE_KERNELS_ADD_LIST_H
\ No newline at end of file
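The list.h change is the structural half of the patch: the add_same_sve template body moves out of the header into impl.cpp, which then explicitly instantiates the five element types the kernels need (the template void add_same_sve<...> lines above). With the definition confined to one SVE-compiled translation unit, headers such as list.h can be included from files built without SVE flags. A generic sketch of the idiom, with hypothetical names:

    // add_impl.h -- declaration only; safe to include anywhere
    #include <cstdint>

    template <typename T>
    void add_same(const T *a, const T *b, T *out, int n);

    // add_impl.cpp -- definition plus explicit instantiations; only this
    // file needs the backend-specific compile flags
    template <typename T>
    void add_same(const T *a, const T *b, T *out, int n)
    {
        for(int i = 0; i < n; ++i)
        {
            out[i] = a[i] + b[i];
        }
    }

    template void add_same<float>(const float *, const float *, float *, int);
    template void add_same<int32_t>(const int32_t *, const int32_t *, int32_t *, int);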
diff --git a/src/core/cpu/kernels/elementwise/sve/elementwise.cpp b/src/core/cpu/kernels/elementwise/sve/elementwise.cpp
new file mode 100644
index 0000000000..2c3bb0ff7c
--- /dev/null
+++ b/src/core/cpu/kernels/elementwise/sve/elementwise.cpp
@@ -0,0 +1,309 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "src/core/cpu/kernels/elementwise/sve/elementwise_list.h"
+#include <arm_sve.h>
+
+namespace arm_compute
+{
+namespace cpu
+{
+using namespace arm_compute::wrapper;
+
+template <typename InputScalarType, typename OutputScalarType, typename OperatorType>
+struct LoopArguments
+{
+    OperatorType op;
+    const InputScalarType *input1_ptr;
+    const InputScalarType *input2_ptr;
+    OutputScalarType *output_ptr;
+};
+
+template <typename InputScalarType, typename OutputScalarType, typename OperatorType>
+struct BroadcastLoopArguments
+{
+    OperatorType op;
+    const InputScalarType *input1_ptr;
+    InputScalarType broadcast_value;
+    OutputScalarType *output_ptr;
+    bool reorder;
+};
+
+template <typename InputScalarType, typename OutputScalarType>
+void arithmetic_op_loop(svbool_t pg, const LoopArguments<InputScalarType, OutputScalarType, ArithmeticOperation> &args)
+{
+    const auto in1 = svld1(pg, args.input1_ptr);
+    const auto in2 = svld1(pg, args.input2_ptr);
+    const auto res = elementwise_arithmetic_op<typename sve_vector<InputScalarType>::type>(pg, in1, in2, args.op);
+    svst1(pg, args.output_ptr, res);
+}
+
+template <typename InputScalarType, typename OutputScalarType>
+void arithmetic_op_broadcast_loop(svbool_t pg, const BroadcastLoopArguments<InputScalarType, OutputScalarType, ArithmeticOperation> &args)
+{
+    const auto non_broadcast_vector = svld1(pg, args.input1_ptr);
+    const auto broadcast_vector = svdup_n(args.broadcast_value);
+    const auto in1 = args.reorder ? broadcast_vector : non_broadcast_vector;
+    const auto in2 = args.reorder ? non_broadcast_vector : broadcast_vector;
+    const auto res = elementwise_arithmetic_op<typename sve_vector<InputScalarType>::type>(pg, in1, in2, args.op);
+    svst1(pg, args.output_ptr, res);
+}
+
+template <typename InputScalarType, typename OutputScalarType>
+void comparison_op_loop(svbool_t pg, const LoopArguments<InputScalarType, OutputScalarType, ComparisonOperation> &args)
+{
+    const auto in1 = svld1(pg, args.input1_ptr);
+    const auto in2 = svld1(pg, args.input2_ptr);
+    const auto res = elementwise_comparison_op<typename sve_vector<InputScalarType>::type, typename sve_vector<OutputScalarType>::type>(pg, in1, in2, args.op);
+    const svbool_t output_pg = narrow_to_byte_predicate<sizeof(InputScalarType)>(pg);
+    svst1(output_pg, args.output_ptr, res);
+}
+
+template <typename InputScalarType, typename OutputScalarType>
+void comparison_op_broadcast_loop(svbool_t pg, const BroadcastLoopArguments<InputScalarType, OutputScalarType, ComparisonOperation> &args)
+{
+    const auto non_broadcast_vector = svld1(pg, args.input1_ptr);
+    const auto broadcast_vector = svdup_n(args.broadcast_value);
+    const auto in1 = args.reorder ? broadcast_vector : non_broadcast_vector;
+    const auto in2 = args.reorder ? non_broadcast_vector : broadcast_vector;
+    const auto res = elementwise_comparison_op<typename sve_vector<InputScalarType>::type, typename sve_vector<OutputScalarType>::type>(pg, in1, in2, args.op);
+    const svbool_t output_pg = narrow_to_byte_predicate<sizeof(InputScalarType)>(pg);
+    svst1(output_pg, args.output_ptr, res);
+}
+
+template <typename InputScalarType, typename OutputScalarType, typename OperatorType>
+using LoopFuncType = void (*)(svbool_t, const LoopArguments<InputScalarType, OutputScalarType, OperatorType> &);
+
+template <typename InputScalarType, typename OutputScalarType, typename OperatorType>
+using BroadcastLoopFuncType = void (*)(svbool_t, const BroadcastLoopArguments<InputScalarType, OutputScalarType, OperatorType> &);
+
+template <typename InputVectorType, typename OutputVectorType, typename OperatorType,
+          typename InputScalarType = typename sve_scalar<InputVectorType>::type,
+          typename OutputScalarType = typename sve_scalar<OutputVectorType>::type>
+void elementwise_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window,
+                    OperatorType op,
+                    LoopFuncType<InputScalarType, OutputScalarType, OperatorType> func,
+                    BroadcastLoopFuncType<InputScalarType, OutputScalarType, OperatorType> broadcast_func)
+{
+    const auto all_true_pg = svptrue<InputScalarType>();
+
+    // Create input windows
+    Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
+    Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());
+
+    // Clear X Dimension on execution window as we handle manually
+    Window win = window;
+    win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+    const auto window_start_x = static_cast<int>(window.x().start());
+    const auto window_end_x = static_cast<int>(window.x().end());
+    const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x();
+
+    if(is_broadcast_across_x)
+    {
+        const bool is_broadcast_input_2 = input2_win.x().step() == 0;
+        Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
+        Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
+        const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1;
+        const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1;
+
+        // Clear X Dimension on execution window as we handle manually
+        non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+        Iterator broadcast_input(broadcast_tensor, broadcast_win);
+        Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
+        Iterator output(out, win);
+
+        execute_window_loop(win, [&](const Coordinates &)
+        {
+            auto output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr());
+            const auto non_broadcast_input_ptr = reinterpret_cast<const InputScalarType *>(non_broadcast_input.ptr());
+            const InputScalarType broadcast_value = *reinterpret_cast<const InputScalarType *>(broadcast_input.ptr());
+
+            int x = window_start_x;
+
+            svbool_t pg = svwhilelt<InputScalarType>(x, window_end_x);
+            do
+            {
+                broadcast_func(pg,
+                {
+                    op,
+                    non_broadcast_input_ptr + x,
+                    broadcast_value,
+                    output_ptr + x,
+                    !is_broadcast_input_2
+                });
+                x += svcnt<InputScalarType>();
+                pg = svwhilelt<InputScalarType>(x, window_end_x);
+            }
+            while(svptest_any(all_true_pg, pg));
+        },
+        broadcast_input, non_broadcast_input, output);
+    }
+    else
+    {
+        // Clear X Dimension on execution window as we handle manually
+        input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+        input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+        Iterator input1(in1, input1_win);
+        Iterator input2(in2, input2_win);
+        Iterator output(out, win);
+
+        execute_window_loop(win, [&](const Coordinates &)
+        {
+            auto output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr());
+            const auto input1_ptr = reinterpret_cast<const InputScalarType *>(input1.ptr());
+            const auto input2_ptr = reinterpret_cast<const InputScalarType *>(input2.ptr());
+
+            int x = window_start_x;
+
+            svbool_t pg = svwhilelt<InputScalarType>(x, window_end_x);
+            do
+            {
+                func(pg,
+                {
+                    op,
+                    input1_ptr + x,
+                    input2_ptr + x,
+                    output_ptr + x
+                });
+                x += svcnt<InputScalarType>();
+                pg = svwhilelt<InputScalarType>(x, window_end_x);
+            }
+            while(svptest_any(all_true_pg, pg));
+        },
+        input1, input2, output);
+    }
+}
+
+template <ArithmeticOperation op, typename ScalarType>
+void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+    using VectorType = typename sve_vector<ScalarType>::type;
+
+    elementwise_op<VectorType, VectorType, ArithmeticOperation>(in1, in2, out, window, op,
+                                                                &arithmetic_op_loop<ScalarType, ScalarType>,
+                                                                &arithmetic_op_broadcast_loop<ScalarType, ScalarType>);
+}
+
+template <ComparisonOperation op, typename InputScalarType, typename OutputScalarType = uint8_t>
+void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+    static_assert(sizeof(InputScalarType) >= sizeof(OutputScalarType), "input data type's width should be equal to or greater than output data type's width");
+    using InputVectorType = typename sve_vector<InputScalarType>::type;
+    using OutputVectorType = typename sve_vector<OutputScalarType>::type;
+
+    elementwise_op<InputVectorType, OutputVectorType, ComparisonOperation>(in1, in2, out, window, op,
+                                                                           &comparison_op_loop<InputScalarType, OutputScalarType>,
+                                                                           &comparison_op_broadcast_loop<InputScalarType, OutputScalarType>);
+}
+
+template <>
+svint32_t elementwise_pow<svint32_t>(svbool_t &pg, const svint32_t &a, const svint32_t &b)
+{
+    return svcvt_s32_z(pg, svpow_z(pg, svcvt_f32_z(pg, a), svcvt_f32_z(pg, b)));
+}
+
+template <>
+svint32_t elementwise_div<svint32_t>(svbool_t &pg, const svint32_t &a, const svint32_t &b)
+{
+    return svcvt_s32_z(pg, svdiv_z(pg, svcvt_f32_z(pg, a), svcvt_f32_z(pg, b)));
+}
+
+template <>
+svint16_t elementwise_div<svint16_t>(svbool_t &pg, const svint16_t &a, const svint16_t &b)
+{
+    ARM_COMPUTE_UNUSED(pg, a, b);
+    ARM_COMPUTE_ERROR("Not supported");
+}
+
+template void elementwise_arithmetic_op<ArithmeticOperation::MAX, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_arithmetic_op<ArithmeticOperation::MAX, float32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_arithmetic_op<ArithmeticOperation::MAX, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_arithmetic_op<ArithmeticOperation::MAX, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+
+template void elementwise_arithmetic_op<ArithmeticOperation::MIN, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_arithmetic_op<ArithmeticOperation::MIN, float32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_arithmetic_op<ArithmeticOperation::MIN, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_arithmetic_op<ArithmeticOperation::MIN, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+
+template void elementwise_arithmetic_op<ArithmeticOperation::SQUARED_DIFF, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_arithmetic_op<ArithmeticOperation::SQUARED_DIFF, float32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_arithmetic_op<ArithmeticOperation::SQUARED_DIFF, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_arithmetic_op<ArithmeticOperation::SQUARED_DIFF, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+
+template void elementwise_arithmetic_op<ArithmeticOperation::PRELU, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_arithmetic_op<ArithmeticOperation::PRELU, float32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_arithmetic_op<ArithmeticOperation::PRELU, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_arithmetic_op<ArithmeticOperation::PRELU, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+
+template void elementwise_arithmetic_op<ArithmeticOperation::DIV, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_arithmetic_op<ArithmeticOperation::DIV, float32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_arithmetic_op<ArithmeticOperation::DIV, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_arithmetic_op<ArithmeticOperation::DIV, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+
+template void elementwise_arithmetic_op<ArithmeticOperation::POWER, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_arithmetic_op<ArithmeticOperation::POWER, float32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_arithmetic_op<ArithmeticOperation::POWER, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_arithmetic_op<ArithmeticOperation::POWER, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+
+template void elementwise_comparison_op<ComparisonOperation::Equal, float>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_comparison_op<ComparisonOperation::Equal, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_comparison_op<ComparisonOperation::Equal, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_comparison_op<ComparisonOperation::Equal, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_comparison_op<ComparisonOperation::Equal, uint8_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+
+template void elementwise_comparison_op<ComparisonOperation::NotEqual, float>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_comparison_op<ComparisonOperation::NotEqual, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_comparison_op<ComparisonOperation::NotEqual, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_comparison_op<ComparisonOperation::NotEqual, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_comparison_op<ComparisonOperation::NotEqual, uint8_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+
+template void elementwise_comparison_op<ComparisonOperation::Greater, float>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_comparison_op<ComparisonOperation::Greater, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_comparison_op<ComparisonOperation::Greater, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_comparison_op<ComparisonOperation::Greater, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_comparison_op<ComparisonOperation::Greater, uint8_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+
+template void elementwise_comparison_op<ComparisonOperation::GreaterEqual, float>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_comparison_op<ComparisonOperation::GreaterEqual, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_comparison_op<ComparisonOperation::GreaterEqual, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_comparison_op<ComparisonOperation::GreaterEqual, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_comparison_op<ComparisonOperation::GreaterEqual, uint8_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+
+template void elementwise_comparison_op<ComparisonOperation::Less, float>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_comparison_op<ComparisonOperation::Less, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_comparison_op<ComparisonOperation::Less, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_comparison_op<ComparisonOperation::Less, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_comparison_op<ComparisonOperation::Less, uint8_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+
+template void elementwise_comparison_op<ComparisonOperation::LessEqual, float>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_comparison_op<ComparisonOperation::LessEqual, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_comparison_op<ComparisonOperation::LessEqual, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_comparison_op<ComparisonOperation::LessEqual, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_comparison_op<ComparisonOperation::LessEqual, uint8_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+} // namespace cpu
+} // namespace arm_compute
\ No newline at end of file diff --git a/src/core/cpu/kernels/elementwise/sve/elementwise_list.h b/src/core/cpu/kernels/elementwise/sve/elementwise_list.h index 83c3355de4..a92a8648a8 100644 --- a/src/core/cpu/kernels/elementwise/sve/elementwise_list.h +++ b/src/core/cpu/kernels/elementwise/sve/elementwise_list.h @@ -23,50 +23,62 @@ */ #ifndef SRC_CORE_SVE_KERNELS_ELEMENTWISE_LIST_H #define SRC_CORE_SVE_KERNELS_ELEMENTWISE_LIST_H -#if defined(__ARM_FEATURE_SVE) +#if defined(ENABLE_SVE) +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" +#include "arm_compute/core/Window.h" #include "arm_compute/core/utils/misc/Traits.h" #include "src/core/NEON/SVEMath.h" #include "src/core/NEON/wrapper/intrinsics/intrinsics.h" #include "src/core/NEON/wrapper/svtraits.h" +#include "src/core/cpu/kernels/elementwise/sve/elementwise_list.h" #include <arm_sve.h> namespace arm_compute { namespace cpu { -namespace sve -{ using namespace arm_compute::wrapper; template <typename VectorType> -inline VectorType elementwise_pow(svbool_t &pg, const VectorType &a, const VectorType &b) +VectorType elementwise_pow(svbool_t &pg, const VectorType &a, const VectorType &b) { return svpow_z(pg, a, b); } -template <> -inline svint32_t elementwise_pow<svint32_t>(svbool_t &pg, const svint32_t &a, const svint32_t &b) -{ - return svcvt_s32_z(pg, svpow_z(pg, svcvt_f32_z(pg, a), svcvt_f32_z(pg, b))); -} - template <typename VectorType> -inline VectorType elementwise_div(svbool_t &pg, const VectorType &a, const VectorType &b) +VectorType elementwise_div(svbool_t &pg, const VectorType &a, const VectorType &b) { return svdiv_z(pg, a, b); } -template <> -inline svint32_t elementwise_div<svint32_t>(svbool_t &pg, const svint32_t &a, const svint32_t &b) +template <uint32_t bytewidth> +svbool_t narrow_to_byte_predicate(svbool_t pg) { - return svcvt_s32_z(pg, svdiv_z(pg, svcvt_f32_z(pg, a), svcvt_f32_z(pg, b))); + const auto all_false = svpfalse(); + + switch(bytewidth) + { + case 8: + pg = svuzp1_b32(pg, all_false); + /* fall through */ + case 4: + pg = svuzp1_b16(pg, all_false); + /* fall through */ + case 2: + pg = svuzp1_b8(pg, all_false); + /* fall through */ + default: + break; + } + return pg; } template <typename VectorType> -inline VectorType elementwise_arithmetic_op(svbool_t &pg, const VectorType &a, const VectorType &b, ArithmeticOperation op) +VectorType elementwise_arithmetic_op(svbool_t &pg, const VectorType &a, const VectorType &b, ArithmeticOperation op) { - using ScalarType = typename sve_scalar<VectorType>::type; + using ScalarType = typename wrapper::sve_scalar<VectorType>::type; VectorType res{}; switch(op) @@ -108,30 +120,8 @@ inline VectorType elementwise_arithmetic_op(svbool_t &pg, const VectorType &a, c return res; } -template <uint32_t bytewidth> -inline svbool_t narrow_to_byte_predicate(svbool_t pg) -{ - const auto all_false = svpfalse(); - - switch(bytewidth) - { - case 8: - pg = svuzp1_b32(pg, all_false); - /* fall through */ - case 4: - pg = svuzp1_b16(pg, all_false); - /* fall through */ - case 2: - pg = svuzp1_b8(pg, all_false); - /* fall through */ - default: - break; - } - return pg; -} - template <typename InputVectorType, typename OutputVectorType> -inline OutputVectorType elementwise_comparison_op(svbool_t &pg, const InputVectorType &a, const InputVectorType &b, ComparisonOperation op) +OutputVectorType elementwise_comparison_op(svbool_t &pg, const InputVectorType &a, const InputVectorType &b, ComparisonOperation op) { svbool_t 
selection_vector{}; @@ -159,10 +149,10 @@ inline OutputVectorType elementwise_comparison_op(svbool_t &pg, const InputVecto ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); } - using InputScalarType = typename sve_scalar<InputVectorType>::type; + using InputScalarType = typename wrapper::sve_scalar<InputVectorType>::type; selection_vector = narrow_to_byte_predicate<sizeof(InputScalarType)>(selection_vector); - using OutputScalarType = typename sve_scalar<OutputVectorType>::type; + using OutputScalarType = typename wrapper::sve_scalar<OutputVectorType>::type; const auto false_vector = svdup_n(static_cast<OutputScalarType>((uint32_t)0)); const auto true_vector = svdup_n(static_cast<OutputScalarType>(~(uint32_t)0)); auto ret = svsel(selection_vector, true_vector, false_vector); @@ -170,197 +160,12 @@ inline OutputVectorType elementwise_comparison_op(svbool_t &pg, const InputVecto return ret; } -template <typename InputScalarType, typename OutputScalarType, typename OperatorType> -struct LoopArguments -{ - OperatorType op; - const InputScalarType *input1_ptr; - const InputScalarType *input2_ptr; - OutputScalarType *output_ptr; -}; - -template <typename InputScalarType, typename OutputScalarType, typename OperatorType> -struct BroadcastLoopArguments -{ - OperatorType op; - const InputScalarType *input1_ptr; - InputScalarType broadcast_value; - OutputScalarType *output_ptr; - bool reorder; -}; - -template <typename InputScalarType, typename OutputScalarType> -inline void arithmetic_op_loop(svbool_t pg, const LoopArguments<InputScalarType, OutputScalarType, ArithmeticOperation> &args) -{ - const auto in1 = svld1(pg, args.input1_ptr); - const auto in2 = svld1(pg, args.input2_ptr); - const auto res = elementwise_arithmetic_op<typename sve_vector<InputScalarType>::type>(pg, in1, in2, args.op); - svst1(pg, args.output_ptr, res); -} - -template <typename InputScalarType, typename OutputScalarType> -inline void arithmetic_op_broadcast_loop(svbool_t pg, const BroadcastLoopArguments<InputScalarType, OutputScalarType, ArithmeticOperation> &args) -{ - const auto non_broadcast_vector = svld1(pg, args.input1_ptr); - const auto broadcast_vector = svdup_n(args.broadcast_value); - const auto in1 = args.reorder ? broadcast_vector : non_broadcast_vector; - const auto in2 = args.reorder ? non_broadcast_vector : broadcast_vector; - const auto res = elementwise_arithmetic_op<typename sve_vector<InputScalarType>::type>(pg, in1, in2, args.op); - svst1(pg, args.output_ptr, res); -} - -template <typename InputScalarType, typename OutputScalarType> -inline void comparison_op_loop(svbool_t pg, const LoopArguments<InputScalarType, OutputScalarType, ComparisonOperation> &args) -{ - const auto in1 = svld1(pg, args.input1_ptr); - const auto in2 = svld1(pg, args.input2_ptr); - const auto res = elementwise_comparison_op<typename sve_vector<InputScalarType>::type, typename sve_vector<OutputScalarType>::type>(pg, in1, in2, args.op); - const svbool_t output_pg = narrow_to_byte_predicate<sizeof(InputScalarType)>(pg); - svst1(output_pg, args.output_ptr, res); -} - -template <typename InputScalarType, typename OutputScalarType> -inline void comparison_op_broadcast_loop(svbool_t pg, const BroadcastLoopArguments<InputScalarType, OutputScalarType, ComparisonOperation> &args) -{ - const auto non_broadcast_vector = svld1(pg, args.input1_ptr); - const auto broadcast_vector = svdup_n(args.broadcast_value); - const auto in1 = args.reorder ? broadcast_vector : non_broadcast_vector; - const auto in2 = args.reorder ? 
non_broadcast_vector : broadcast_vector; - const auto res = elementwise_comparison_op<typename sve_vector<InputScalarType>::type, typename sve_vector<OutputScalarType>::type>(pg, in1, in2, args.op); - const svbool_t output_pg = narrow_to_byte_predicate<sizeof(InputScalarType)>(pg); - svst1(output_pg, args.output_ptr, res); -} - -template <typename InputScalarType, typename OutputScalarType, typename OperatorType> -using LoopFuncType = void (*)(svbool_t, const LoopArguments<InputScalarType, OutputScalarType, OperatorType> &); - -template <typename InputScalarType, typename OutputScalarType, typename OperatorType> -using BroadcastLoopFuncType = void (*)(svbool_t, const BroadcastLoopArguments<InputScalarType, OutputScalarType, OperatorType> &); - -template <typename InputVectorType, typename OutputVectorType, typename OperatorType, - typename InputScalarType = typename sve_scalar<InputVectorType>::type, - typename OutputScalarType = typename sve_scalar<OutputVectorType>::type> -void elementwise_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, - OperatorType op, - LoopFuncType<InputScalarType, OutputScalarType, OperatorType> func, - BroadcastLoopFuncType<InputScalarType, OutputScalarType, OperatorType> broadcast_func) -{ - const auto all_true_pg = svptrue<InputScalarType>(); - - // Create input windows - Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()); - Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()); - - // Clear X Dimension on execution window as we handle manually - Window win = window; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - const auto window_start_x = static_cast<int>(window.x().start()); - const auto window_end_x = static_cast<int>(window.x().end()); - const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x(); - - if(is_broadcast_across_x) - { - const bool is_broadcast_input_2 = input2_win.x().step() == 0; - Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; - Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; - const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1; - const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? 
in2 : in1; - - // Clear X Dimension on execution window as we handle manually - non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator broadcast_input(broadcast_tensor, broadcast_win); - Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); - Iterator output(out, win); - - execute_window_loop(win, [&](const Coordinates &) - { - auto output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr()); - const auto non_broadcast_input_ptr = reinterpret_cast<const InputScalarType *>(non_broadcast_input.ptr()); - const InputScalarType broadcast_value = *reinterpret_cast<const InputScalarType *>(broadcast_input.ptr()); - - int x = window_start_x; - - svbool_t pg = svwhilelt<InputScalarType>(x, window_end_x); - do - { - broadcast_func(pg, - { - op, - non_broadcast_input_ptr + x, - broadcast_value, - output_ptr + x, - !is_broadcast_input_2 - }); - x += svcnt<InputScalarType>(); - pg = svwhilelt<InputScalarType>(x, window_end_x); - } - while(svptest_any(all_true_pg, pg)); - }, - broadcast_input, non_broadcast_input, output); - } - else - { - // Clear X Dimension on execution window as we handle manually - input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input1(in1, input1_win); - Iterator input2(in2, input2_win); - Iterator output(out, win); - - execute_window_loop(win, [&](const Coordinates &) - { - auto output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr()); - const auto input1_ptr = reinterpret_cast<const InputScalarType *>(input1.ptr()); - const auto input2_ptr = reinterpret_cast<const InputScalarType *>(input2.ptr()); - - int x = window_start_x; - - svbool_t pg = svwhilelt<InputScalarType>(x, window_end_x); - do - { - func(pg, - { - op, - input1_ptr + x, - input2_ptr + x, - output_ptr + x - }); - x += svcnt<InputScalarType>(); - pg = svwhilelt<InputScalarType>(x, window_end_x); - } - while(svptest_any(all_true_pg, pg)); - }, - input1, input2, output); - } -} - template <ArithmeticOperation op, typename ScalarType> -void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) -{ - using VectorType = typename sve_vector<ScalarType>::type; - - elementwise_op<VectorType, VectorType, ArithmeticOperation>(in1, in2, out, window, op, - &arithmetic_op_loop<ScalarType, ScalarType>, - &arithmetic_op_broadcast_loop<ScalarType, ScalarType>); -} - -template <ComparisonOperation op, typename InputScalarType, typename OutputScalarType = uint8_t> -void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) -{ - static_assert(sizeof(InputScalarType) >= sizeof(OutputScalarType), "input data type's width should be equal to or greater than output data type's width"); - using InputVectorType = typename sve_vector<InputScalarType>::type; - using OutputVectorType = typename sve_vector<OutputScalarType>::type; - - elementwise_op<InputVectorType, OutputVectorType, ComparisonOperation>(in1, in2, out, window, op, - &comparison_op_loop<InputScalarType, OutputScalarType>, - &comparison_op_broadcast_loop<InputScalarType, OutputScalarType>); -} +void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -} // namespace sve +template <ComparisonOperation op, typename ScalarType, typename OutputScalarType = uint8_t> +void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); } // namespace cpu } // namespace 
arm_compute -#endif // defined(__ARM_FEATURE_SVE) +#endif // defined(ENABLE_SVE) #endif /* SRC_CORE_SVE_KERNELS_ELEMENTWISE_LIST_H */ diff --git a/src/core/cpu/kernels/elementwise/sve/elementwise_quantized_list.h b/src/core/cpu/kernels/elementwise/sve/elementwise_quantized_list.h index b6342c727c..6c5524e284 100644 --- a/src/core/cpu/kernels/elementwise/sve/elementwise_quantized_list.h +++ b/src/core/cpu/kernels/elementwise/sve/elementwise_quantized_list.h @@ -26,14 +26,13 @@ #if defined(__ARM_FEATURE_SVE2) +#include "src/core/NEON/wrapper/svtraits.h" #include "src/core/cpu/kernels/elementwise/sve/elementwise_list.h" namespace arm_compute { namespace cpu { -namespace sve -{ using namespace arm_compute::wrapper; template <typename InputScalarType, typename OutputScalarType, typename OperatorType> @@ -176,7 +175,7 @@ inline void comparison_op_quantized_loop(svbool_t pg, const QuantizedLoopArgumen const auto in1 = load_quantized(args.input1_ptr, pg, args.in1_offset, args.in1_scale); const auto in2 = load_quantized(args.input2_ptr, pg, args.in2_offset, args.in2_scale); - using OutputVectorType = typename sve_vector<OutputScalarType>::type; + using OutputVectorType = typename wrapper::traits::sve_vector<OutputScalarType>::type; const auto result = svcreate4( elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in1, 0), svget4(in2, 0), args.op), @@ -200,7 +199,7 @@ inline void comparison_op_broadcast_quantized_loop(svbool_t pg, const BroadcastQ const auto &af = args.reorder ? in2 : in1; const auto &bf = args.reorder ? in1 : in2; - using OutputVectorType = typename sve_vector<OutputScalarType>::type; + using OutputVectorType = typename wrapper::traits::sve_vector<OutputScalarType>::type; const auto result = svcreate4( elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(af, 0), svget4(bf, 0), args.op), @@ -221,8 +220,8 @@ template <typename InputScalarType, typename OutputScalarType, typename Operator using BroadcastQuantizedLoopFuncType = void (*)(svbool_t, const BroadcastQuantizedLoopArguments<InputScalarType, OutputScalarType, OperatorType> &); template <typename InputVectorType, typename OutputVectorType, typename OperatorType, - typename InputScalarType = typename sve_scalar<InputVectorType>::type, - typename OutputScalarType = typename sve_scalar<OutputVectorType>::type> + typename InputScalarType = typename wrapper::sve_scalar<InputVectorType>::type, + typename OutputScalarType = typename wrapper::sve_scalar<OutputVectorType>::type> void elementwise_quantized_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, OperatorType op, LoopQuantizedFuncType<InputScalarType, OutputScalarType, OperatorType> func, @@ -344,7 +343,7 @@ void elementwise_quantized_op(const ITensor *in1, const ITensor *in2, ITensor *o template <ArithmeticOperation op, typename ScalarType> void elementwise_arithmetic_quantized_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) { - using VectorType = typename sve_vector<ScalarType>::type; + using VectorType = typename wrapper::traits::sve_vector<ScalarType>::type; elementwise_quantized_op<VectorType, VectorType, ArithmeticOperation>(in1, in2, out, window, op, &arithmetic_op_quantized_loop<ScalarType, ScalarType>, &arithmetic_op_broadcast_quantized_loop<ScalarType, ScalarType>); @@ -354,14 +353,12 @@ template <ComparisonOperation op, typename InputScalarType, typename OutputScala void elementwise_comparison_quantized_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window 
&window) { static_assert(sizeof(InputScalarType) >= sizeof(OutputScalarType), "input data type's width should be equal to or greater than output data type's width"); - using InputVectorType = typename sve_vector<InputScalarType>::type; - using OutputVectorType = typename sve_vector<OutputScalarType>::type; + using InputVectorType = typename wrapper::traits::sve_vector<InputScalarType>::type; + using OutputVectorType = typename wrapper::traits::sve_vector<OutputScalarType>::type; elementwise_quantized_op<InputVectorType, OutputVectorType, ComparisonOperation>(in1, in2, out, window, op, &comparison_op_quantized_loop<InputScalarType, OutputScalarType>, &comparison_op_broadcast_quantized_loop<InputScalarType, OutputScalarType>); } - -} // namespace sve } // namespace cpu } // namespace arm_compute diff --git a/src/core/cpu/kernels/elementwise/sve/elementwise_unary.cpp b/src/core/cpu/kernels/elementwise/sve/elementwise_unary.cpp new file mode 100644 index 0000000000..cb58548f0b --- /dev/null +++ b/src/core/cpu/kernels/elementwise/sve/elementwise_unary.cpp @@ -0,0 +1,111 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Window.h" +#include "arm_compute/core/utils/misc/Traits.h" +#include "src/core/NEON/SVEMath.h" +#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" +#include <arm_sve.h> + +namespace arm_compute +{ +namespace cpu +{ +template <typename ScalarType, typename VectorType> +inline typename std::enable_if<utils::traits::is_floating_point<ScalarType>::value, VectorType>::type elementwise_op_sve_imp(svbool_t pg, ElementWiseUnary op, const VectorType &a) +{ + switch(op) + { + case ElementWiseUnary::RSQRT: + return svinvsqrt(pg, a); + case ElementWiseUnary::EXP: + return wrapper::svexp_z(pg, a); + case ElementWiseUnary::NEG: + return svneg_z(pg, a); + case ElementWiseUnary::LOG: + return wrapper::svlog_z(pg, a); + case ElementWiseUnary::ABS: + return svabs_z(pg, a); + case ElementWiseUnary::ROUND: + return svrintn_z(pg, a); + case ElementWiseUnary::SIN: + return wrapper::svsin_z(pg, a); + default: + ARM_COMPUTE_ERROR("NOT_SUPPORTED"); + } +} + +template <typename ScalarType, typename VectorType> +inline typename std::enable_if<std::is_integral<ScalarType>::value, VectorType>::type elementwise_op_sve_imp(svbool_t pg, ElementWiseUnary op, const VectorType &a) +{ + switch(op) + { + case ElementWiseUnary::NEG: + return svneg_z(pg, a); + case ElementWiseUnary::ABS: + return svabs_z(pg, a); + default: + ARM_COMPUTE_ERROR("NOT_SUPPORTED"); + } +} + +template <typename ScalarType> +void elementwise_sve_op(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op) +{ + const auto all_true_pg = wrapper::svptrue<ScalarType>(); + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input(in, win); + Iterator output(out, win); + + execute_window_loop(win, [&](const Coordinates &) + { + auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr()); + const auto input_ptr = reinterpret_cast<const ScalarType *>(input.ptr()); + int x = window_start_x; + + svbool_t pg = wrapper::svwhilelt<ScalarType>(x, window_end_x); + do + { + const auto vin = svld1(pg, input_ptr + x); + svst1(pg, output_ptr + x, elementwise_op_sve_imp<ScalarType, decltype(vin)>(pg, op, vin)); + x += wrapper::svcnt<ScalarType>(); + pg = wrapper::svwhilelt<ScalarType>(x, window_end_x); + } + while(svptest_any(all_true_pg, pg)); + }, + input, output); +} + +template void elementwise_sve_op<float16_t>(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op); +template void elementwise_sve_op<float32_t>(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op); +template void elementwise_sve_op<int32_t>(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op); +} // namespace cpu +} // namespace arm_compute
\ No newline at end of file diff --git a/src/core/cpu/kernels/elementwise/sve/elementwise_unary_list.h b/src/core/cpu/kernels/elementwise/sve/elementwise_unary_list.h index 23502c71e5..63490421e9 100644 --- a/src/core/cpu/kernels/elementwise/sve/elementwise_unary_list.h +++ b/src/core/cpu/kernels/elementwise/sve/elementwise_unary_list.h @@ -25,87 +25,15 @@ #define SRC_CORE_SVE_KERNELS_ELEMENTWISE_UNARY_LIST_H #include "arm_compute/core/Types.h" -#include "arm_compute/core/utils/misc/Traits.h" -#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" -#if defined(__ARM_FEATURE_SVE) -#include "src/core/NEON/SVEMath.h" -#include <arm_sve.h> +#if defined(ENABLE_SVE) namespace arm_compute { namespace cpu { -template <typename ScalarType, typename VectorType> -inline typename std::enable_if<utils::traits::is_floating_point<ScalarType>::value, VectorType>::type elementwise_op_sve_imp(svbool_t pg, ElementWiseUnary op, const VectorType &a) -{ - switch(op) - { - case ElementWiseUnary::RSQRT: - return svinvsqrt(pg, a); - case ElementWiseUnary::EXP: - return wrapper::svexp_z(pg, a); - case ElementWiseUnary::NEG: - return svneg_z(pg, a); - case ElementWiseUnary::LOG: - return wrapper::svlog_z(pg, a); - case ElementWiseUnary::ABS: - return svabs_z(pg, a); - case ElementWiseUnary::ROUND: - return svrintn_z(pg, a); - case ElementWiseUnary::SIN: - return wrapper::svsin_z(pg, a); - default: - ARM_COMPUTE_ERROR("NOT_SUPPORTED"); - } -} - -template <typename ScalarType, typename VectorType> -inline typename std::enable_if<std::is_integral<ScalarType>::value, VectorType>::type elementwise_op_sve_imp(svbool_t pg, ElementWiseUnary op, const VectorType &a) -{ - switch(op) - { - case ElementWiseUnary::NEG: - return svneg_z(pg, a); - case ElementWiseUnary::ABS: - return svabs_z(pg, a); - default: - ARM_COMPUTE_ERROR("NOT_SUPPORTED"); - } -} - template <typename ScalarType> -void elementwise_sve_op(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op) -{ - const auto all_true_pg = wrapper::svptrue<ScalarType>(); - const auto window_start_x = static_cast<int>(window.x().start()); - const auto window_end_x = static_cast<int>(window.x().end()); - - Window win = window; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input(in, win); - Iterator output(out, win); - - execute_window_loop(win, [&](const Coordinates &) - { - auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr()); - const auto input_ptr = reinterpret_cast<const ScalarType *>(input.ptr()); - int x = window_start_x; - - svbool_t pg = wrapper::svwhilelt<ScalarType>(x, window_end_x); - do - { - const auto vin = svld1(pg, input_ptr + x); - svst1(pg, output_ptr + x, elementwise_op_sve_imp<ScalarType, decltype(vin)>(pg, op, vin)); - x += wrapper::svcnt<ScalarType>(); - pg = wrapper::svwhilelt<ScalarType>(x, window_end_x); - } - while(svptest_any(all_true_pg, pg)); - }, - input, output); -} - +void elementwise_sve_op(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op); } // namespace cpu } // namespace arm_compute -#endif // defined(__ARM_FEATURE_SVE) +#endif // defined(ENABLE_SVE) #endif // SRC_CORE_NEON_KERNELS_ELEMENTWISE_UNARY_LIST_H
\ No newline at end of file diff --git a/src/core/cpu/kernels/floor/NEON/fp16.cpp b/src/core/cpu/kernels/floor/neon/fp16.cpp index f362676a36..f362676a36 100644 --- a/src/core/cpu/kernels/floor/NEON/fp16.cpp +++ b/src/core/cpu/kernels/floor/neon/fp16.cpp diff --git a/src/core/cpu/kernels/floor/NEON/fp32.cpp b/src/core/cpu/kernels/floor/neon/fp32.cpp index f5efb2e849..f5efb2e849 100644 --- a/src/core/cpu/kernels/floor/NEON/fp32.cpp +++ b/src/core/cpu/kernels/floor/neon/fp32.cpp diff --git a/src/core/cpu/kernels/scale/sve/fp16.cpp b/src/core/cpu/kernels/scale/sve/fp16.cpp index 99f08dbdf9..5b9377c6e6 100644 --- a/src/core/cpu/kernels/scale/sve/fp16.cpp +++ b/src/core/cpu/kernels/scale/sve/fp16.cpp @@ -21,6 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ + +#if defined(ENABLE_SVE) #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensorPack.h" #include "arm_compute/core/Window.h" @@ -30,12 +32,10 @@ #include "src/core/utils/ScaleUtils.h" #include "support/Rounding.h" +#include <arm_sve.h> #include <cmath> #include <cstddef> -#if defined(__ARM_FEATURE_SVE) -#include <arm_sve.h> - namespace arm_compute { namespace @@ -173,4 +173,4 @@ void fp16_sve_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, co } // namespace cpu } // namespace arm_compute -#endif // __ARM_FEATURE_SVE
\ No newline at end of file +#endif // ENABLE_SVE
\ No newline at end of file diff --git a/src/core/cpu/kernels/scale/sve/fp32.cpp b/src/core/cpu/kernels/scale/sve/fp32.cpp index 94055ae953..05fbedf20d 100644 --- a/src/core/cpu/kernels/scale/sve/fp32.cpp +++ b/src/core/cpu/kernels/scale/sve/fp32.cpp @@ -21,6 +21,7 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ +#if defined(ENABLE_SVE) #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensorPack.h" #include "arm_compute/core/Window.h" @@ -33,7 +34,6 @@ #include <cmath> #include <cstddef> -#if defined(__ARM_FEATURE_SVE) #include <arm_sve.h> namespace arm_compute @@ -171,4 +171,4 @@ void fp32_sve_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, co } // namespace cpu } // namespace arm_compute -#endif // __ARM_FEATURE_SVE
\ No newline at end of file +#endif // ENABLE_SVE
\ No newline at end of file diff --git a/src/core/cpu/kernels/scale/sve/integer.cpp b/src/core/cpu/kernels/scale/sve/integer.cpp index 2a724ece31..d7e270c661 100644 --- a/src/core/cpu/kernels/scale/sve/integer.cpp +++ b/src/core/cpu/kernels/scale/sve/integer.cpp @@ -21,6 +21,7 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ +#if defined(ENABLE_SVE) #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensorPack.h" #include "arm_compute/core/Window.h" @@ -30,12 +31,10 @@ #include "src/core/utils/ScaleUtils.h" #include "support/Rounding.h" +#include <arm_sve.h> #include <cmath> #include <cstddef> -#if defined(__ARM_FEATURE_SVE) -#include <arm_sve.h> - namespace arm_compute { namespace @@ -298,4 +297,4 @@ void s16_sve_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, con } // namespace cpu } // namespace arm_compute -#endif // __ARM_FEATURE_SVE
\ No newline at end of file +#endif // ENABLE_SVE
\ No newline at end of file diff --git a/src/core/cpu/kernels/scale/sve/qasymm8.cpp b/src/core/cpu/kernels/scale/sve/qasymm8.cpp index c041f14b22..f747037938 100644 --- a/src/core/cpu/kernels/scale/sve/qasymm8.cpp +++ b/src/core/cpu/kernels/scale/sve/qasymm8.cpp @@ -21,6 +21,7 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ +#if defined(ENABLE_SVE) #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensorPack.h" #include "arm_compute/core/Window.h" @@ -31,12 +32,10 @@ #include "src/core/utils/ScaleUtils.h" #include "support/Rounding.h" +#include <arm_sve.h> #include <cmath> #include <cstddef> -#if defined(__ARM_FEATURE_SVE) -#include <arm_sve.h> - namespace arm_compute { namespace @@ -90,8 +89,8 @@ void qasymm8_sve_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor bool align_corners, const Window &window) { // Data layout is NHWC - const int idx_width = 1; - const int idx_height = 2; + const int idx_width = 1; + const int idx_height = 2; // Compute the ratio between source height and destination height const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(idx_height), dst->info()->dimension(idx_height), align_corners); @@ -205,4 +204,4 @@ void qasymm8_sve_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, } // namespace cpu } // namespace arm_compute -#endif // __ARM_FEATURE_SVE
\ No newline at end of file +#endif // ENABLE_SVE
\ No newline at end of file diff --git a/src/core/cpu/kernels/scale/sve/qasymm8_signed.cpp b/src/core/cpu/kernels/scale/sve/qasymm8_signed.cpp index 9df4301fe3..584ec7a0da 100644 --- a/src/core/cpu/kernels/scale/sve/qasymm8_signed.cpp +++ b/src/core/cpu/kernels/scale/sve/qasymm8_signed.cpp @@ -21,6 +21,7 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ +#if defined(ENABLE_SVE) #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensorPack.h" #include "arm_compute/core/Window.h" @@ -31,12 +32,10 @@ #include "src/core/utils/ScaleUtils.h" #include "support/Rounding.h" +#include <arm_sve.h> #include <cmath> #include <cstddef> -#if defined(__ARM_FEATURE_SVE) -#include <arm_sve.h> - namespace arm_compute { namespace @@ -90,8 +89,8 @@ void qasymm8_signed_sve_scale_bilinear(const ITensor *src, ITensor *dst, const I bool align_corners, const Window &window) { // Data layout is NHWC - const int idx_width = 1; - const int idx_height = 2; + const int idx_width = 1; + const int idx_height = 2; // Compute the ratio between source height and destination height const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(idx_height), dst->info()->dimension(idx_height), align_corners); @@ -205,4 +204,4 @@ void qasymm8_signed_sve_scale(const ITensor *src, ITensor *dst, const ITensor *o } // namespace cpu } // namespace arm_compute -#endif // __ARM_FEATURE_SVE
\ No newline at end of file +#endif // ENABLE_SVE
\ No newline at end of file diff --git a/src/core/cpu/kernels/softmax/impl/NEON/list.h b/src/core/cpu/kernels/softmax/impl/neon/list.h index 5ebee31272..5ebee31272 100644 --- a/src/core/cpu/kernels/softmax/impl/NEON/list.h +++ b/src/core/cpu/kernels/softmax/impl/neon/list.h diff --git a/src/core/cpu/kernels/softmax/impl/sve/impl.cpp b/src/core/cpu/kernels/softmax/impl/sve/impl.cpp new file mode 100644 index 0000000000..4ed5a4fbea --- /dev/null +++ b/src/core/cpu/kernels/softmax/impl/sve/impl.cpp @@ -0,0 +1,185 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#if defined(ENABLE_SVE) +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/utils/misc/Traits.h" +#include "src/core/NEON/SVEMath.h" +#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" +#include <arm_sve.h> + +namespace arm_compute +{ +namespace cpu +{ +template <typename ScalarType> +void sve_logits_1d_max(const ITensor *in, ITensor *out, const Window &window) +{ + const auto all_true_pg = wrapper::svptrue<ScalarType>(); + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + + Window win{ window }; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + Iterator input(in, win); + Iterator output(out, win); + + execute_window_loop(win, [&](const Coordinates &) + { + // Get pointers + const auto in_ptr = reinterpret_cast<const ScalarType *>(input.ptr()); + const auto out_ptr = reinterpret_cast<ScalarType *>(output.ptr()); + + // Init max value + auto vec_max = wrapper::svdup_n(support::cpp11::lowest<ScalarType>()); + + int x = window_start_x; + svbool_t pg = wrapper::svwhilelt<ScalarType>(x, window_end_x); + do + { + const auto current_value = svld1(pg, in_ptr + x); + vec_max = svmax_m(pg, vec_max, current_value); + + x += wrapper::svcnt<ScalarType>(); + pg = wrapper::svwhilelt<ScalarType>(x, window_end_x); + } + while(svptest_any(all_true_pg, pg)); + + auto max_val = svmaxv(all_true_pg, vec_max); + + *out_ptr = max_val; + }, + input, output); +} + +template <typename ScalarType> +void sve_softmax_logits_1d_float(const ITensor *in, const ITensor *max, void *const tmp, + ITensor *out, const float beta, bool is_log, const Window &window) +{ + const int start_x = in->info()->valid_region().anchor.x(); + const 
int input_width = in->info()->valid_region().shape.x(); + + Iterator in_it(in, window); + Iterator max_it(max, window); + Iterator out_it(out, window); + + const auto all_true_pg = wrapper::svptrue<ScalarType>(); + + execute_window_loop(window, [&](const Coordinates &) + { + /* Get pointers */ + const auto in_ptr = reinterpret_cast<const ScalarType *>(in_it.ptr()) + start_x; + const auto out_ptr = reinterpret_cast<ScalarType *>(out_it.ptr()) + start_x; + const auto tmp_ptr = reinterpret_cast<ScalarType *>(tmp); + + ScalarType sum{ 0 }; + + /* Compute exponentials and sum */ + { + /* Get max value */ + const auto max_val = *reinterpret_cast<const ScalarType *>(max_it.ptr()); + const auto vec_max = wrapper::svdup_n(max_val); + + /* Init sum to zero */ + auto vec_sum = wrapper::svdup_n(static_cast<ScalarType>(0)); + + /* Loop over row and compute exponentials and sum */ + int x = 0; + svbool_t pg = wrapper::svwhilelt<ScalarType>(x, input_width); + do + { + auto vec_elements = svld1(pg, in_ptr + x); + vec_elements = svsub_z(pg, vec_elements, vec_max); + if(is_log) + { + vec_elements = svmul_z(pg, vec_elements, wrapper::svdup_n(static_cast<ScalarType>(beta))); + vec_sum = svadd_m(pg, vec_sum, wrapper::svexp_z(pg, vec_elements)); + } + else + { + vec_elements = wrapper::svexp_z(pg, svmul_z(pg, vec_elements, wrapper::svdup_n(static_cast<ScalarType>(beta)))); + vec_sum = svadd_m(pg, vec_sum, vec_elements); + } + svst1(pg, tmp_ptr + x, vec_elements); + + x += wrapper::svcnt<ScalarType>(); + pg = wrapper::svwhilelt<ScalarType>(x, input_width); + } + while(svptest_any(all_true_pg, pg)); + + /* Reduce sum */ + sum = svaddv(all_true_pg, vec_sum); + + if(is_log) + { + sum = static_cast<ScalarType>(std::log(sum)); + } + else + { + sum = ScalarType(1) / sum; + } + } + + /* Normalize exponentials */ + { + /* Loop over row and compute softmax */ + int x = 0; + svbool_t pg = wrapper::svwhilelt<ScalarType>(x, input_width); + do + { + auto vec_in = svld1(pg, tmp_ptr + x); + auto normalized_value = wrapper::svdup_n(static_cast<ScalarType>(0)); + if(is_log) + { + normalized_value = svsub_z(pg, vec_in, wrapper::svdup_n(static_cast<ScalarType>(sum))); + } + else + { + normalized_value = svmul_z(pg, vec_in, wrapper::svdup_n(static_cast<ScalarType>(sum))); + } + svst1(pg, out_ptr + x, normalized_value); + + x += wrapper::svcnt<ScalarType>(); + pg = wrapper::svwhilelt<ScalarType>(x, input_width); + } + while(svptest_any(all_true_pg, pg)); + } + }, + in_it, max_it, out_it); +} + +template void sve_logits_1d_max<float>(const ITensor *in, ITensor *out, const Window &window); +template void sve_logits_1d_max<float16_t>(const ITensor *in, ITensor *out, const Window &window); +template void sve_logits_1d_max<qasymm8_t>(const ITensor *in, ITensor *out, const Window &window); +template void sve_logits_1d_max<qasymm8_signed_t>(const ITensor *in, ITensor *out, const Window &window); + +template void sve_softmax_logits_1d_float<float>(const ITensor *in, const ITensor *max, void *const tmp, + ITensor *out, const float beta, bool is_log, const Window &window); +template void sve_softmax_logits_1d_float<float16_t>(const ITensor *in, const ITensor *max, void *const tmp, + ITensor *out, const float beta, bool is_log, const Window &window); +} // namespace cpu +} // namespace arm_compute +#endif /* defined(ENABLE_SVE) */ diff --git a/src/core/cpu/kernels/softmax/impl/SVE/list.h b/src/core/cpu/kernels/softmax/impl/sve/list.h index d558d7d193..7ddb358b8e 100644 --- a/src/core/cpu/kernels/softmax/impl/SVE/list.h +++ 
b/src/core/cpu/kernels/softmax/impl/sve/list.h @@ -24,7 +24,7 @@ #ifndef SRC_CORE_SVE_KERNELS_SOFTMAX_LIST_H #define SRC_CORE_SVE_KERNELS_SOFTMAX_LIST_H -#if defined(__ARM_FEATURE_SVE) +#if defined(ENABLE_SVE) #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/Traits.h" #include "src/core/NEON/SVEMath.h" @@ -36,44 +36,11 @@ namespace arm_compute namespace cpu { template <typename ScalarType> -void sve_logits_1d_max(const ITensor *in, ITensor *out, const Window &window) -{ - const auto all_true_pg = wrapper::svptrue<ScalarType>(); - const auto window_start_x = static_cast<int>(window.x().start()); - const auto window_end_x = static_cast<int>(window.x().end()); - - Window win{ window }; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - Iterator input(in, win); - Iterator output(out, win); - - execute_window_loop(win, [&](const Coordinates &) - { - // Get pointers - const auto in_ptr = reinterpret_cast<const ScalarType *>(input.ptr()); - const auto out_ptr = reinterpret_cast<ScalarType *>(output.ptr()); - - // Init max value - auto vec_max = wrapper::svdup_n(support::cpp11::lowest<ScalarType>()); +void sve_logits_1d_max(const ITensor *in, ITensor *out, const Window &window); - int x = window_start_x; - svbool_t pg = wrapper::svwhilelt<ScalarType>(x, window_end_x); - do - { - const auto current_value = svld1(pg, in_ptr + x); - vec_max = svmax_m(pg, vec_max, current_value); - - x += wrapper::svcnt<ScalarType>(); - pg = wrapper::svwhilelt<ScalarType>(x, window_end_x); - } - while(svptest_any(all_true_pg, pg)); - - auto max_val = svmaxv(all_true_pg, vec_max); - - *out_ptr = max_val; - }, - input, output); -} +template <typename ScalarType> +void sve_softmax_logits_1d_float(const ITensor *in, const ITensor *max, void *const tmp, + ITensor *out, const float beta, bool is_log, const Window &window); #if defined(__ARM_FEATURE_SVE2) template <typename ScalarType> @@ -249,105 +216,8 @@ void sve_softmax_logits_1d_quantized(const ITensor *in, const ITensor *max, void in_it, max_it, out_it); } #endif /* defined(__ARM_FEATURE_SVE2) */ - -template <typename ScalarType> -void sve_softmax_logits_1d_float(const ITensor *in, const ITensor *max, void *const tmp, - ITensor *out, const float beta, bool is_log, const Window &window) -{ - const int start_x = in->info()->valid_region().anchor.x(); - const int input_width = in->info()->valid_region().shape.x(); - - Iterator in_it(in, window); - Iterator max_it(max, window); - Iterator out_it(out, window); - - const auto all_true_pg = wrapper::svptrue<ScalarType>(); - - execute_window_loop(window, [&](const Coordinates &) - { - /* Get pointers */ - const auto in_ptr = reinterpret_cast<const ScalarType *>(in_it.ptr()) + start_x; - const auto out_ptr = reinterpret_cast<ScalarType *>(out_it.ptr()) + start_x; - const auto tmp_ptr = reinterpret_cast<ScalarType *>(tmp); - - ScalarType sum{ 0 }; - - /* Compute exponentials and sum */ - { - /* Get max value */ - const auto max_val = *reinterpret_cast<const ScalarType *>(max_it.ptr()); - const auto vec_max = wrapper::svdup_n(max_val); - - /* Init sum to zero */ - auto vec_sum = wrapper::svdup_n(static_cast<ScalarType>(0)); - - /* Loop over row and compute exponentials and sum */ - int x = 0; - svbool_t pg = wrapper::svwhilelt<ScalarType>(x, input_width); - do - { - auto vec_elements = svld1(pg, in_ptr + x); - vec_elements = svsub_z(pg, vec_elements, vec_max); - if(is_log) - { - vec_elements = svmul_z(pg, vec_elements, wrapper::svdup_n(static_cast<ScalarType>(beta))); - vec_sum = svadd_m(pg, vec_sum, 
wrapper::svexp_z(pg, vec_elements)); - } - else - { - vec_elements = wrapper::svexp_z(pg, svmul_z(pg, vec_elements, wrapper::svdup_n(static_cast<ScalarType>(beta)))); - vec_sum = svadd_m(pg, vec_sum, vec_elements); - } - svst1(pg, tmp_ptr + x, vec_elements); - - x += wrapper::svcnt<ScalarType>(); - pg = wrapper::svwhilelt<ScalarType>(x, input_width); - } - while(svptest_any(all_true_pg, pg)); - - /* Reduce sum */ - sum = svaddv(all_true_pg, vec_sum); - - if(is_log) - { - sum = static_cast<ScalarType>(std::log(sum)); - } - else - { - sum = ScalarType(1) / sum; - } - } - - /* Normalize exponentials */ - { - /* Loop over row and compute softmax */ - int x = 0; - svbool_t pg = wrapper::svwhilelt<ScalarType>(x, input_width); - do - { - auto vec_in = svld1(pg, tmp_ptr + x); - auto normalized_value = wrapper::svdup_n(static_cast<ScalarType>(0)); - if(is_log) - { - normalized_value = svsub_z(pg, vec_in, wrapper::svdup_n(static_cast<ScalarType>(sum))); - } - else - { - normalized_value = svmul_z(pg, vec_in, wrapper::svdup_n(static_cast<ScalarType>(sum))); - } - svst1(pg, out_ptr + x, normalized_value); - - x += wrapper::svcnt<ScalarType>(); - pg = wrapper::svwhilelt<ScalarType>(x, input_width); - } - while(svptest_any(all_true_pg, pg)); - } - }, - in_it, max_it, out_it); -} - } // namespace cpu } // namespace arm_compute -#endif /* defined(__ARM_FEATURE_SVE) */ +#endif /* defined(ENABLE_SVE) */ #endif /* SRC_CORE_SVE_KERNELS_SOFTMAX_LIST_H */
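
Every SVE kernel touched by this patch drives its innermost dimension with the same whilelt/cnt/ptest idiom: build a governing predicate for the elements that remain, run one predicated vector iteration, advance by the runtime vector length, and loop while any lane is still active, so no scalar tail loop is ever needed. Below is a minimal standalone sketch of that idiom under the new ENABLE_SVE guard; the function name sve_add_f32 and its raw-pointer interface are assumptions for illustration only (the library kernels above operate on ITensor/Window instead), and n is assumed positive.

// Hedged sketch, not part of the patch: the predicated tail-loop idiom shared
// by the kernels above, reduced to a flat float32 addition.
#if defined(ENABLE_SVE)
#include <arm_sve.h>

void sve_add_f32(const float *a, const float *b, float *dst, int n) // illustrative name/signature
{
    const svbool_t all_true_pg = svptrue_b32();

    int      x  = 0;
    svbool_t pg = svwhilelt_b32(x, n); // lanes [x, n) active, the rest masked off
    do
    {
        const svfloat32_t va = svld1(pg, a + x); // predicated load: inactive lanes read as zero
        const svfloat32_t vb = svld1(pg, b + x);
        svst1(pg, dst + x, svadd_z(pg, va, vb)); // predicated store: inactive lanes untouched
        x += svcntw();                           // runtime count of 32-bit lanes per vector
        pg = svwhilelt_b32(x, n);                // rebuild the predicate for the remaining tail
    }
    while(svptest_any(all_true_pg, pg)); // continue while any lane is still active
}
#endif // ENABLE_SVE

Because the predicate rather than a fixed trip count terminates the loop, the same object code handles the partial final vector on any SVE implementation from 128 to 2048 bits, which is why the kernels moved behind ENABLE_SVE carry no fallback tail code.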