From 29e27b0544d99e5d98f044a9e606db8abcfb8900 Mon Sep 17 00:00:00 2001 From: Pablo Marquez Tello Date: Thu, 3 Aug 2023 14:47:31 +0100 Subject: Add support for S64 output in NEArgMinMaxLayer * NEArgMinMaxLayer uses NEReductionOperation to compute its result in S32 * We need to call NECast to convert from S32 to S64 * Resolves MLCE-1089 Change-Id: I6fded869b6076d7af1b9b3e70eb384f4ee82fd8a Signed-off-by: Pablo Marquez Tello Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/10054 Tested-by: Arm Jenkins Reviewed-by: Viet-Hoa Do Comments-Addressed: Arm Jenkins Benchmark: Arm Jenkins --- src/cpu/kernels/CpuCastKernel.cpp | 48 ++++++++++++++++++++++++++++++++++----- 1 file changed, 42 insertions(+), 6 deletions(-) (limited to 'src/cpu/kernels/CpuCastKernel.cpp') diff --git a/src/cpu/kernels/CpuCastKernel.cpp b/src/cpu/kernels/CpuCastKernel.cpp index 641dea40dc..d478328d07 100644 --- a/src/cpu/kernels/CpuCastKernel.cpp +++ b/src/cpu/kernels/CpuCastKernel.cpp @@ -103,15 +103,20 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, Conver ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::U8, DataType::S16, DataType::U16, DataType::BFLOAT16, DataType::F16, DataType::F32, DataType::S32, DataType::S64, DataType::U64); -#else // __aarch64__ + + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::U8, + DataType::S16, DataType::U16, DataType::BFLOAT16, DataType::F16, + DataType::U32, DataType::S32, DataType::F32, DataType::S64); + +#else // __aarch64__ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::U8, DataType::S16, DataType::U16, DataType::BFLOAT16, DataType::F16, DataType::F32, DataType::S32); -#endif // __aarch64__ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::U8, DataType::S16, DataType::U16, DataType::BFLOAT16, DataType::F16, DataType::U32, DataType::S32, DataType::F32); +#endif // __aarch64__ ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::QASYMM8_SIGNED && (dst->data_type() != DataType::S16 && dst->data_type() != DataType::S32 && dst->data_type() != DataType::F16 && dst->data_type() != DataType::F32), @@ -146,13 +151,15 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, Conver ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::S32 && (dst->data_type() != DataType::QASYMM8_SIGNED && dst->data_type() != DataType::QASYMM8 && dst->data_type() != DataType::F16 - && dst->data_type() != DataType::F32 && dst->data_type() != DataType::U8), - "Only data_types supported [in] S32 -> [out] QASYMM8, F16, F32, U8"); + && dst->data_type() != DataType::F32 + && dst->data_type() != DataType::U8 + && dst->data_type() != DataType::S64), + "Only data_types supported [in] S32 -> [out] QASYMM8, F16, F32, U8, S64"); #ifdef __aarch64__ - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::S64 && dst->data_type() != DataType::F32, + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::S64 && dst->data_type() != DataType::F32, "Only data_types supported [in] S64 -> [out] F32"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::U64 && dst->data_type() != DataType::F32, + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::U64 && dst->data_type() != DataType::F32, "Only data_types supported [in] U64 -> [out] F32"); #endif // __aarch64__ @@ -198,6 +205,28 @@ inline void internal_neon_convert(const T1 *src_ptr, T2 *dst_ptr) ARM_COMPUTE_UNUSED(dst_ptr); } +template <> +inline void internal_neon_convert(const int32_t *src_ptr, int64_t *dst_ptr) +{ + const int32x4x4_t texels = + { + { + vld1q_s32(src_ptr), + vld1q_s32(src_ptr + 4), + vld1q_s32(src_ptr + 8), + vld1q_s32(src_ptr + 12) + } + }; + vst1q_s64(dst_ptr, vmovl_s32(vget_low_s32(texels.val[0]))); + vst1q_s64(dst_ptr + 2, vmovl_s32(vget_high_s32(texels.val[0]))); + vst1q_s64(dst_ptr + 4, vmovl_s32(vget_low_s32(texels.val[1]))); + vst1q_s64(dst_ptr + 6, vmovl_s32(vget_high_s32(texels.val[1]))); + vst1q_s64(dst_ptr + 8, vmovl_s32(vget_low_s32(texels.val[2]))); + vst1q_s64(dst_ptr + 10, vmovl_s32(vget_high_s32(texels.val[2]))); + vst1q_s64(dst_ptr + 12, vmovl_s32(vget_low_s32(texels.val[3]))); + vst1q_s64(dst_ptr + 14, vmovl_s32(vget_high_s32(texels.val[3]))); +} + template <> inline void internal_neon_convert(const int64_t *src_ptr, float *dst_ptr) { @@ -1062,6 +1091,13 @@ void CpuCastKernel::run_op(ITensorPack &tensors, const Window &window, const Thr case DataType::S32: switch(_dst->info()->data_type()) { +#if __aarch64__ + case DataType::S64: + { + convert64(src, dst, win, window_start_x, window_end_x, window_step_x); + break; + } +#endif // __aarch64__ case DataType::F16: { /* Down-conversion S32 -> F16 */ -- cgit v1.2.1