From 4a1c91767142f76e92bf4575564d7e54fcd0ebf4 Mon Sep 17 00:00:00 2001 From: Pablo Marquez Tello Date: Tue, 18 Jul 2023 14:51:24 +0100 Subject: Add support for input S64/U64 in CpuCastKernel * The kernel now supports the following conversions: S64 -> F32 U64 -> F32 * Resolves MLCE-1089 Change-Id: I277cf58b78d919fde25947520d2056e1412c7f82 Signed-off-by: Pablo Marquez Tello Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/9935 Tested-by: Arm Jenkins Reviewed-by: Viet-Hoa Do Comments-Addressed: Arm Jenkins Benchmark: Arm Jenkins --- Android.bp | 3 +- arm_compute/core/Types.h | 2 + src/core/Utils.cpp | 12 +- src/cpu/kernels/CpuCastKernel.cpp | 153 ++++++++++++++++++++++- src/cpu/kernels/CpuCastKernel.h | 3 +- src/cpu/operators/CpuCast.h | 3 +- tests/SimpleTensor.h | 2 + tests/validation/NEON/Cast.cpp | 19 +++ tests/validation/reference/DepthConvertLayer.cpp | 2 + 9 files changed, 193 insertions(+), 6 deletions(-) diff --git a/Android.bp b/Android.bp index 2e1247a162..b7936e5671 100644 --- a/Android.bp +++ b/Android.bp @@ -167,7 +167,8 @@ arm_compute_library_defaults { "-Wno-unused-parameter", "-DNO_DOT_IN_TOOLCHAIN", "-Wno-implicit-fallthrough", - "-fPIC" + "-fPIC", + "-DACL_INTERNAL_TEST_CKW_IN_DF" ], rtti: true, } diff --git a/arm_compute/core/Types.h b/arm_compute/core/Types.h index d9c89f8779..a69177ed80 100644 --- a/arm_compute/core/Types.h +++ b/arm_compute/core/Types.h @@ -81,6 +81,8 @@ enum class Format U16, /**< 1 channel, 1 U16 per channel */ S32, /**< 1 channel, 1 S32 per channel */ U32, /**< 1 channel, 1 U32 per channel */ + S64, /**< 1 channel, 1 S64 per channel */ + U64, /**< 1 channel, 1 U64 per channel */ BFLOAT16, /**< 16-bit brain floating-point number */ F16, /**< 1 channel, 1 F16 per channel */ F32, /**< 1 channel, 1 F32 per channel */ diff --git a/src/core/Utils.cpp b/src/core/Utils.cpp index ef7186aad1..89f373fc87 100644 --- a/src/core/Utils.cpp +++ b/src/core/Utils.cpp @@ -414,7 +414,7 @@ QuantizationInfo get_softmax_output_quantization_info(DataType input_type, bool return QuantizationInfo(1.f / 256, 0); } -std::pair get_quantized_activation_min_max(const ActivationLayerInfo& act_info, DataType data_type, UniformQuantizationInfo oq_info) +std::pair get_quantized_activation_min_max(const ActivationLayerInfo &act_info, DataType data_type, UniformQuantizationInfo oq_info) { const bool is_qasymm8_signed = is_data_type_quantized_asymmetric_signed(data_type); const auto a = act_info.a(); @@ -497,6 +497,12 @@ void print_consecutive_elements(std::ostream &s, DataType dt, const uint8_t *ptr case DataType::S32: print_consecutive_elements_impl(s, reinterpret_cast(ptr), n, stream_width, element_delim); break; + case DataType::U64: + print_consecutive_elements_impl(s, reinterpret_cast(ptr), n, stream_width, element_delim); + break; + case DataType::S64: + print_consecutive_elements_impl(s, reinterpret_cast(ptr), n, stream_width, element_delim); + break; case DataType::BFLOAT16: print_consecutive_elements_impl(s, reinterpret_cast(ptr), n, stream_width, element_delim); break; @@ -533,6 +539,10 @@ int max_consecutive_elements_display_width(std::ostream &s, DataType dt, const u return max_consecutive_elements_display_width_impl(s, reinterpret_cast(ptr), n); case DataType::S32: return max_consecutive_elements_display_width_impl(s, reinterpret_cast(ptr), n); + case DataType::U64: + return max_consecutive_elements_display_width_impl(s, reinterpret_cast(ptr), n); + case DataType::S64: + return max_consecutive_elements_display_width_impl(s, reinterpret_cast(ptr), n); case DataType::BFLOAT16: return max_consecutive_elements_display_width_impl(s, reinterpret_cast(ptr), n); case DataType::F16: diff --git a/src/cpu/kernels/CpuCastKernel.cpp b/src/cpu/kernels/CpuCastKernel.cpp index 15a9ddcab4..641dea40dc 100644 --- a/src/cpu/kernels/CpuCastKernel.cpp +++ b/src/cpu/kernels/CpuCastKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2022 Arm Limited. + * Copyright (c) 2016-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -99,9 +99,16 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, Conver ARM_COMPUTE_RETURN_ERROR_ON_CPU_BF16_UNSUPPORTED(dst); ARM_COMPUTE_UNUSED(policy); ARM_COMPUTE_RETURN_ERROR_ON(src == dst); +#ifdef __aarch64__ + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::U8, + DataType::S16, DataType::U16, DataType::BFLOAT16, DataType::F16, + DataType::F32, DataType::S32, DataType::S64, DataType::U64); +#else // __aarch64__ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::U8, DataType::S16, DataType::U16, DataType::BFLOAT16, DataType::F16, DataType::F32, DataType::S32); +#endif // __aarch64__ + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::U8, DataType::S16, DataType::U16, DataType::BFLOAT16, DataType::F16, DataType::U32, DataType::S32, DataType::F32); @@ -141,6 +148,13 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, Conver && dst->data_type() != DataType::F16 && dst->data_type() != DataType::F32 && dst->data_type() != DataType::U8), "Only data_types supported [in] S32 -> [out] QASYMM8, F16, F32, U8"); +#ifdef __aarch64__ + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::S64 && dst->data_type() != DataType::F32, + "Only data_types supported [in] S64 -> [out] F32"); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::U64 && dst->data_type() != DataType::F32, + "Only data_types supported [in] U64 -> [out] F32"); +#endif // __aarch64__ // Validate in case of configured dst if(dst->total_size() > 0) @@ -174,6 +188,111 @@ Status CpuCastKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, C ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, policy)); return Status{}; } +#ifdef __aarch64__ +namespace +{ +template +inline void internal_neon_convert(const T1 *src_ptr, T2 *dst_ptr) +{ + ARM_COMPUTE_UNUSED(src_ptr); + ARM_COMPUTE_UNUSED(dst_ptr); +} + +template <> +inline void internal_neon_convert(const int64_t *src_ptr, float *dst_ptr) +{ + const float64x2x4_t texels0 = + { + { + vcvtq_f64_s64(vld1q_s64(src_ptr)), + vcvtq_f64_s64(vld1q_s64(src_ptr + 2)), + vcvtq_f64_s64(vld1q_s64(src_ptr + 4)), + vcvtq_f64_s64(vld1q_s64(src_ptr + 6)) + } + }; + const float64x2x4_t texels1 = + { + { + vcvtq_f64_s64(vld1q_s64(src_ptr + 8)), + vcvtq_f64_s64(vld1q_s64(src_ptr + 10)), + vcvtq_f64_s64(vld1q_s64(src_ptr + 12)), + vcvtq_f64_s64(vld1q_s64(src_ptr + 14)) + } + }; + const float32x4x4_t texels = + { + { + vcombine_f32(vcvt_f32_f64(texels0.val[0]), vcvt_f32_f64(texels0.val[1])), + vcombine_f32(vcvt_f32_f64(texels0.val[2]), vcvt_f32_f64(texels0.val[3])), + vcombine_f32(vcvt_f32_f64(texels1.val[0]), vcvt_f32_f64(texels1.val[1])), + vcombine_f32(vcvt_f32_f64(texels1.val[2]), vcvt_f32_f64(texels1.val[3])) + } + }; + vst1q_f32(dst_ptr, texels.val[0]); + vst1q_f32(dst_ptr + 4, texels.val[1]); + vst1q_f32(dst_ptr + 8, texels.val[2]); + vst1q_f32(dst_ptr + 12, texels.val[3]); +} + +template <> +inline void internal_neon_convert(const uint64_t *src_ptr, float *dst_ptr) +{ + const float64x2x4_t texels0 = + { + { + vcvtq_f64_u64(vld1q_u64(src_ptr)), + vcvtq_f64_u64(vld1q_u64(src_ptr + 2)), + vcvtq_f64_u64(vld1q_u64(src_ptr + 4)), + vcvtq_f64_u64(vld1q_u64(src_ptr + 6)) + } + }; + const float64x2x4_t texels1 = + { + { + vcvtq_f64_u64(vld1q_u64(src_ptr + 8)), + vcvtq_f64_u64(vld1q_u64(src_ptr + 10)), + vcvtq_f64_u64(vld1q_u64(src_ptr + 12)), + vcvtq_f64_u64(vld1q_u64(src_ptr + 14)) + } + }; + + const float32x4x4_t texels = + { + { + vcombine_f32(vcvt_f32_f64(texels0.val[0]), vcvt_f32_f64(texels0.val[1])), + vcombine_f32(vcvt_f32_f64(texels0.val[2]), vcvt_f32_f64(texels0.val[3])), + vcombine_f32(vcvt_f32_f64(texels1.val[0]), vcvt_f32_f64(texels1.val[1])), + vcombine_f32(vcvt_f32_f64(texels1.val[2]), vcvt_f32_f64(texels1.val[3])) + } + }; + + vst1q_f32(dst_ptr, texels.val[0]); + vst1q_f32(dst_ptr + 4, texels.val[1]); + vst1q_f32(dst_ptr + 8, texels.val[2]); + vst1q_f32(dst_ptr + 12, texels.val[3]); +} + +template +inline void convert64(Iterator &src, Iterator &dst, const Window &win, int window_start_x, int window_end_x, int window_step_x) +{ + execute_window_loop(win, [&](const Coordinates &) + { + const auto src_ptr = reinterpret_cast(src.ptr()); + const auto dst_ptr = reinterpret_cast(dst.ptr()); + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + internal_neon_convert(src_ptr + x, dst_ptr + x); + } + for(; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast(*(src_ptr + x)); + } + }, + src, dst); +} +} // namespace +#endif // __aarch64__ void CpuCastKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) { @@ -203,6 +322,37 @@ void CpuCastKernel::run_op(ITensorPack &tensors, const Window &window, const Thr switch(_src->info()->data_type()) { +#ifdef __aarch64__ + case DataType::U64: + { + switch(_dst->info()->data_type()) + { + case DataType::F32: + { + convert64(src, dst, win, window_start_x, window_end_x, window_step_x); + break; + } + default: + ARM_COMPUTE_ERROR("dst data type not supported"); + } + break; + } + case DataType::S64: + { + switch(_dst->info()->data_type()) + { + case DataType::F32: + { + convert64(src, dst, win, window_start_x, window_end_x, window_step_x); + break; + } + default: + ARM_COMPUTE_ERROR("dst data type not supported"); + } + break; + } +#endif // __aarch64__ + case DataType::QASYMM8_SIGNED: { switch(_dst->info()->data_type()) @@ -909,7 +1059,6 @@ void CpuCastKernel::run_op(ITensorPack &tensors, const Window &window, const Thr ARM_COMPUTE_ERROR("dst data type not supported"); } break; - case DataType::S32: switch(_dst->info()->data_type()) { diff --git a/src/cpu/kernels/CpuCastKernel.h b/src/cpu/kernels/CpuCastKernel.h index de4ace2140..76237368d8 100644 --- a/src/cpu/kernels/CpuCastKernel.h +++ b/src/cpu/kernels/CpuCastKernel.h @@ -57,9 +57,10 @@ public: * - BFLOAT16 -> F32 * - F16 -> QASYMM8_SIGNED, QASYMM8, F32, S32, U8 * - S32 -> QASYMM8_SIGNED, QASYMM8, F16, F32, U8 + * - S64 -> F32 * - F32 -> QASYMM8_SIGNED, QASYMM8, BFLOAT16, F16, S32, U8 * - * @param[in] src The src tensor to convert. Data types supported: QASYMM8_SIGNED/QASYMM8/U8/U16/S16/BFLOAT16/F16/F32. + * @param[in] src The src tensor to convert. Data types supported: QASYMM8_SIGNED/QASYMM8/U8/U16/S16/S32/S64/BFLOAT16/F16/F32. * @param[out] dst The dst tensor. Data types supported: QASYMM8_SIGNED/QASYMM8/U8/U16/S16/U32/S32/BFLOAT16/F16/F32. * @param[in] policy Conversion policy. * diff --git a/src/cpu/operators/CpuCast.h b/src/cpu/operators/CpuCast.h index a8342581cb..356b033dbd 100644 --- a/src/cpu/operators/CpuCast.h +++ b/src/cpu/operators/CpuCast.h @@ -52,8 +52,9 @@ public: * |F16 | QASYMM8_SIGNED, QASYMM8, F32, S32, U8 | * |S32 | QASYMM8_SIGNED, QASYMM8, F16, F32, U8 | * |F32 | QASYMM8_SIGNED, QASYMM8, BFLOAT16, F16, S32, U8| + * |S64 | F32 | * - * @param[in] src The source tensor to convert. Data types supported: U8/S8/U16/S16/U32/S32/F16/F32. + * @param[in] src The source tensor to convert. Data types supported: U8/S8/U16/S16/U32/S32/S64/F16/F32. * @param[out] dst The destination tensor. Data types supported: U8/S8/U16/S16/U32/S32/F16/F32. * @param[in] policy Conversion policy. * diff --git a/tests/SimpleTensor.h b/tests/SimpleTensor.h index 9ea171d492..419621e808 100644 --- a/tests/SimpleTensor.h +++ b/tests/SimpleTensor.h @@ -392,6 +392,8 @@ int SimpleTensor::num_channels() const case Format::S16: case Format::U32: case Format::S32: + case Format::U64: + case Format::S64: case Format::F16: case Format::F32: return 1; diff --git a/tests/validation/NEON/Cast.cpp b/tests/validation/NEON/Cast.cpp index 166847ed66..a1ddcc9cad 100644 --- a/tests/validation/NEON/Cast.cpp +++ b/tests/validation/NEON/Cast.cpp @@ -101,6 +101,11 @@ const auto CastF32toS32Dataset = combine(framework::dataset::make("Da const auto CastF32toQASYMM8Dataset = combine(framework::dataset::make("DataType", DataType::F32), framework::dataset::make("DataType", DataType::QASYMM8)); const auto CastF32toQASYMM8_SIGNEDDataset = combine(framework::dataset::make("DataType", DataType::F32), framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)); +// U64 +const auto CastU64toF32Dataset = combine(framework::dataset::make("DataType", DataType::U64), framework::dataset::make("DataType", DataType::F32)); + +// S64 +const auto CastS64toF32Dataset = combine(framework::dataset::make("DataType", DataType::S64), framework::dataset::make("DataType", DataType::F32)); } // namespace TEST_SUITE(NEON) @@ -108,6 +113,8 @@ TEST_SUITE(Cast) template using NECastToU8Fixture = CastValidationFixture; template +using NECastToS8Fixture = CastValidationFixture; +template using NECastToU16Fixture = CastValidationFixture; template using NECastToS16Fixture = CastValidationFixture; @@ -116,6 +123,10 @@ using NECastToU32Fixture = CastValidationFixture using NECastToS32Fixture = CastValidationFixture; template +using NECastToU64Fixture = CastValidationFixture; +template +using NECastToS64Fixture = CastValidationFixture; +template using NECastToF16Fixture = CastValidationFixture; template using NECastToF32Fixture = CastValidationFixture; @@ -189,6 +200,14 @@ CAST_SUITE(F32_to_F16, DataType::F32, DataType::F16, NECastToF16Fixture, CAST_SUITE(F32_to_S32, DataType::F32, DataType::S32, NECastToS32Fixture, CastF32toS32Dataset, one_tolerance) CAST_SUITE(F32_to_U8, DataType::F32, DataType::S32, NECastToS32Fixture, CastF32toS32Dataset, one_tolerance) +#ifdef __aarch64__ +// S64 +CAST_SUITE(S64_to_F32, DataType::S64, DataType::F32, NECastToF32Fixture, CastS64toF32Dataset, zero_tolerance) + +// U64 +CAST_SUITE(U64_to_F32, DataType::U64, DataType::F32, NECastToF32Fixture, CastU64toF32Dataset, zero_tolerance) +#endif // __aarch64__ + DATA_TEST_CASE(KernelSelectionDstFP16, framework::DatasetMode::ALL, combine(framework::dataset::make("CpuExt", std::string("NEON")), framework::dataset::make("DataType", diff --git a/tests/validation/reference/DepthConvertLayer.cpp b/tests/validation/reference/DepthConvertLayer.cpp index 8797722f00..1e4939129e 100644 --- a/tests/validation/reference/DepthConvertLayer.cpp +++ b/tests/validation/reference/DepthConvertLayer.cpp @@ -194,6 +194,7 @@ template SimpleTensor depth_convert(const SimpleTensor &src, D template SimpleTensor depth_convert(const SimpleTensor &src, DataType dt_out, ConvertPolicy policy, uint32_t shift); template SimpleTensor depth_convert(const SimpleTensor &src, DataType dt_out, ConvertPolicy policy, uint32_t shift); template SimpleTensor depth_convert(const SimpleTensor &src, DataType dt_out, ConvertPolicy policy, uint32_t shift); +template SimpleTensor depth_convert(const SimpleTensor &src, DataType dt_out, ConvertPolicy policy, uint32_t shift); // U64 template SimpleTensor depth_convert(const SimpleTensor &src, DataType dt_out, ConvertPolicy policy, uint32_t shift); @@ -203,6 +204,7 @@ template SimpleTensor depth_convert(const SimpleTensor &src, template SimpleTensor depth_convert(const SimpleTensor &src, DataType dt_out, ConvertPolicy policy, uint32_t shift); template SimpleTensor depth_convert(const SimpleTensor &src, DataType dt_out, ConvertPolicy policy, uint32_t shift); template SimpleTensor depth_convert(const SimpleTensor &src, DataType dt_out, ConvertPolicy policy, uint32_t shift); +template SimpleTensor depth_convert(const SimpleTensor &src, DataType dt_out, ConvertPolicy policy, uint32_t shift); } // namespace reference } // namespace validation } // namespace test -- cgit v1.2.1