diff options
author | Pablo Marquez Tello <pablo.tello@arm.com> | 2023-08-03 14:47:31 +0100 |
---|---|---|
committer | Pablo Marquez Tello <pablo.tello@arm.com> | 2023-08-08 15:49:54 +0000 |
commit | 29e27b0544d99e5d98f044a9e606db8abcfb8900 (patch) | |
tree | 3749d3f3640d55fceda4dcd04a2916c87414b045 /src | |
parent | 66b4a6a8ca1ee55e5b7f05bae2543cf99fe22d6d (diff) | |
download | ComputeLibrary-29e27b0544d99e5d98f044a9e606db8abcfb8900.tar.gz |
Add support for S64 output in NEArgMinMaxLayer
* NEArgMinMaxLayer uses NEReductionOperation to compute its result in S32
* We need to call NECast to convert from S32 to S64
* Resolves MLCE-1089
Change-Id: I6fded869b6076d7af1b9b3e70eb384f4ee82fd8a
Signed-off-by: Pablo Marquez Tello <pablo.tello@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/10054
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Viet-Hoa Do <viet-hoa.do@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Benchmark: Arm Jenkins <bsgcomp@arm.com>
Diffstat (limited to 'src')
-rw-r--r-- | src/cpu/kernels/CpuCastKernel.cpp | 48 | ||||
-rw-r--r-- | src/cpu/kernels/CpuCastKernel.h | 4 | ||||
-rw-r--r-- | src/runtime/NEON/functions/NEArgMinMaxLayer.cpp | 44 |
3 files changed, 83 insertions, 13 deletions
diff --git a/src/cpu/kernels/CpuCastKernel.cpp b/src/cpu/kernels/CpuCastKernel.cpp index 641dea40dc..d478328d07 100644 --- a/src/cpu/kernels/CpuCastKernel.cpp +++ b/src/cpu/kernels/CpuCastKernel.cpp @@ -103,15 +103,20 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, Conver ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::U8, DataType::S16, DataType::U16, DataType::BFLOAT16, DataType::F16, DataType::F32, DataType::S32, DataType::S64, DataType::U64); -#else // __aarch64__ + + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::U8, + DataType::S16, DataType::U16, DataType::BFLOAT16, DataType::F16, + DataType::U32, DataType::S32, DataType::F32, DataType::S64); + +#else // __aarch64__ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::U8, DataType::S16, DataType::U16, DataType::BFLOAT16, DataType::F16, DataType::F32, DataType::S32); -#endif // __aarch64__ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::U8, DataType::S16, DataType::U16, DataType::BFLOAT16, DataType::F16, DataType::U32, DataType::S32, DataType::F32); +#endif // __aarch64__ ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::QASYMM8_SIGNED && (dst->data_type() != DataType::S16 && dst->data_type() != DataType::S32 && dst->data_type() != DataType::F16 && dst->data_type() != DataType::F32), @@ -146,13 +151,15 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, Conver ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::S32 && (dst->data_type() != DataType::QASYMM8_SIGNED && dst->data_type() != DataType::QASYMM8 && dst->data_type() != DataType::F16 - && dst->data_type() != DataType::F32 && dst->data_type() != DataType::U8), - "Only data_types supported [in] S32 -> [out] QASYMM8,
F16, F32, U8"); + && dst->data_type() != DataType::F32 + && dst->data_type() != DataType::U8 + && dst->data_type() != DataType::S64), + "Only data_types supported [in] S32 -> [out] QASYMM8, F16, F32, U8, S64"); #ifdef __aarch64__ - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::S64 && dst->data_type() != DataType::F32, + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::S64 && dst->data_type() != DataType::F32, "Only data_types supported [in] S64 -> [out] F32"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::U64 && dst->data_type() != DataType::F32, + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::U64 && dst->data_type() != DataType::F32, "Only data_types supported [in] U64 -> [out] F32"); #endif // __aarch64__ @@ -199,6 +206,28 @@ inline void internal_neon_convert(const T1 *src_ptr, T2 *dst_ptr) } template <> +inline void internal_neon_convert<int32_t, int64_t>(const int32_t *src_ptr, int64_t *dst_ptr) +{ + const int32x4x4_t texels = + { + { + vld1q_s32(src_ptr), + vld1q_s32(src_ptr + 4), + vld1q_s32(src_ptr + 8), + vld1q_s32(src_ptr + 12) + } + }; + vst1q_s64(dst_ptr, vmovl_s32(vget_low_s32(texels.val[0]))); + vst1q_s64(dst_ptr + 2, vmovl_s32(vget_high_s32(texels.val[0]))); + vst1q_s64(dst_ptr + 4, vmovl_s32(vget_low_s32(texels.val[1]))); + vst1q_s64(dst_ptr + 6, vmovl_s32(vget_high_s32(texels.val[1]))); + vst1q_s64(dst_ptr + 8, vmovl_s32(vget_low_s32(texels.val[2]))); + vst1q_s64(dst_ptr + 10, vmovl_s32(vget_high_s32(texels.val[2]))); + vst1q_s64(dst_ptr + 12, vmovl_s32(vget_low_s32(texels.val[3]))); + vst1q_s64(dst_ptr + 14, vmovl_s32(vget_high_s32(texels.val[3]))); +} + +template <> inline void internal_neon_convert<int64_t, float>(const int64_t *src_ptr, float *dst_ptr) { const float64x2x4_t texels0 = @@ -1062,6 +1091,13 @@ void CpuCastKernel::run_op(ITensorPack &tensors, const Window &window, const Thr case DataType::S32: switch(_dst->info()->data_type()) { +#if __aarch64__ + case DataType::S64: + {
+ convert64<int32_t, int64_t>(src, dst, win, window_start_x, window_end_x, window_step_x); + break; + } +#endif // __aarch64__ case DataType::F16: { /* Down-conversion S32 -> F16 */ diff --git a/src/cpu/kernels/CpuCastKernel.h b/src/cpu/kernels/CpuCastKernel.h index 76237368d8..d8e61e6011 100644 --- a/src/cpu/kernels/CpuCastKernel.h +++ b/src/cpu/kernels/CpuCastKernel.h @@ -61,9 +61,11 @@ public: * - F32 -> QASYMM8_SIGNED, QASYMM8, BFLOAT16, F16, S32, U8 * * @param[in] src The src tensor to convert. Data types supported: QASYMM8_SIGNED/QASYMM8/U8/U16/S16/S32/S64/BFLOAT16/F16/F32. - * @param[out] dst The dst tensor. Data types supported: QASYMM8_SIGNED/QASYMM8/U8/U16/S16/U32/S32/BFLOAT16/F16/F32. + * @param[out] dst The dst tensor. Data types supported: QASYMM8_SIGNED/QASYMM8/U8/U16/S16/U32/S32/S64/BFLOAT16/F16/F32. * @param[in] policy Conversion policy. * + * @note S64 is only supported in aarch64 + * * @deprecated Support for BFLOAT16 will be removed in 23.05 release */ void configure(const ITensorInfo *src, ITensorInfo *dst, ConvertPolicy policy); diff --git a/src/runtime/NEON/functions/NEArgMinMaxLayer.cpp b/src/runtime/NEON/functions/NEArgMinMaxLayer.cpp index 3876ae6e87..3ac127b02e 100644 --- a/src/runtime/NEON/functions/NEArgMinMaxLayer.cpp +++ b/src/runtime/NEON/functions/NEArgMinMaxLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2021 Arm Limited. + * Copyright (c) 2018-2021, 2023 Arm Limited.
* * SPDX-License-Identifier: MIT * @@ -29,22 +29,49 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" +#include "arm_compute/runtime/NEON/functions/NECast.h" +#include "arm_compute/runtime/NEON/functions/NEReductionOperation.h" +#include "arm_compute/runtime/Tensor.h" #include "src/common/utils/Log.h" #include "src/core/NEON/kernels/NEReductionOperationKernel.h" namespace arm_compute { +struct NEArgMinMaxLayer::Impl +{ + MemoryGroup memory_group{}; + std::shared_ptr<IMemoryManager> memory_manager{}; + std::unique_ptr<NEReductionOperation> reduction_function{}; + std::unique_ptr<NECast> cast_function{}; + std::unique_ptr<Tensor> tmp_reduction_result{}; +}; + NEArgMinMaxLayer::~NEArgMinMaxLayer() = default; NEArgMinMaxLayer::NEArgMinMaxLayer(std::shared_ptr<IMemoryManager> memory_manager) - : _reduction_function(std::make_unique<NEReductionOperation>()) + : _impl(std::make_unique<Impl>()) { - ARM_COMPUTE_UNUSED(memory_manager); + _impl->memory_manager = std::move(memory_manager); } + void NEArgMinMaxLayer::configure(ITensor *input, int axis, ITensor *output, const ReductionOperation &op) { ARM_COMPUTE_LOG_PARAMS(input, axis, output, op); - _reduction_function->configure(input, output, axis, op, false); + _impl->reduction_function = std::make_unique<NEReductionOperation>(); + if(output->info() && (output->info()->data_type() == DataType::S64 || output->info()->data_type() == DataType::U64)) + { + _impl->memory_group = MemoryGroup(std::move(_impl->memory_manager)); + _impl->cast_function = std::make_unique<NECast>(); + _impl->tmp_reduction_result = std::make_unique<Tensor>(); + _impl->reduction_function->configure(input, _impl->tmp_reduction_result.get(), axis, op, false); + _impl->cast_function->configure(_impl->tmp_reduction_result.get(), output, ConvertPolicy::SATURATE); + _impl->memory_group.manage(_impl->tmp_reduction_result.get()); + _impl->tmp_reduction_result->allocator()->allocate(); + } +
else + { + _impl->reduction_function->configure(input, output, axis, op, false); + } } Status NEArgMinMaxLayer::validate(const ITensorInfo *input, int axis, const ITensorInfo *output, const ReductionOperation &op) @@ -55,7 +82,12 @@ Status NEArgMinMaxLayer::validate(const ITensorInfo *input, int axis, const ITen void NEArgMinMaxLayer::run() { - _reduction_function->run(); + MemoryGroupResourceScope scope_mg(_impl->memory_group); + _impl->reduction_function->run(); + if(_impl->tmp_reduction_result != nullptr) + { + _impl->cast_function->run(); + } } -} // namespace arm_compute
\ No newline at end of file +} // namespace arm_compute