author     Pablo Marquez Tello <pablo.tello@arm.com>  2023-07-18 14:51:24 +0100
committer  Pablo Marquez Tello <pablo.tello@arm.com>  2023-07-19 15:21:03 +0000
commit     4a1c91767142f76e92bf4575564d7e54fcd0ebf4 (patch)
tree       9ff60356ad77b07c3704826c14eb316a9eb9ac27
parent     314d3e2c691734ff942c9a1aca1120b826a267d9 (diff)
Add support for input S64/U64 in CpuCastKernel
* The kernel now supports the following conversions:
  S64 -> F32
  U64 -> F32
* Resolves MLCE-1089

Change-Id: I277cf58b78d919fde25947520d2056e1412c7f82
Signed-off-by: Pablo Marquez Tello <pablo.tello@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/9935
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Viet-Hoa Do <viet-hoa.do@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Benchmark: Arm Jenkins <bsgcomp@arm.com>
 Android.bp                                        |   3
 arm_compute/core/Types.h                          |   2
 src/core/Utils.cpp                                |  12
 src/cpu/kernels/CpuCastKernel.cpp                 | 153
 src/cpu/kernels/CpuCastKernel.h                   |   3
 src/cpu/operators/CpuCast.h                       |   3
 tests/SimpleTensor.h                              |   2
 tests/validation/NEON/Cast.cpp                    |  19
 tests/validation/reference/DepthConvertLayer.cpp  |   2
 9 files changed, 193 insertions(+), 6 deletions(-)
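Before the per-file changes, a minimal usage sketch of the new conversion through the public NECast runtime function; the shapes, the ConvertPolicy, and the main() scaffolding are illustrative assumptions rather than part of the patch, and the S64/U64 paths are only compiled on aarch64:

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/NEON/functions/NECast.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    int main()
    {
        // Source holds signed 64-bit integers, destination 32-bit floats.
        Tensor src, dst;
        src.allocator()->init(TensorInfo(TensorShape(16U, 4U), 1, DataType::S64));
        dst.allocator()->init(TensorInfo(TensorShape(16U, 4U), 1, DataType::F32));

        NECast cast;
        cast.configure(&src, &dst, ConvertPolicy::SATURATE);

        src.allocator()->allocate();
        dst.allocator()->allocate();
        // ... fill src ...
        cast.run();
        return 0;
    }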
diff --git a/Android.bp b/Android.bp
index 2e1247a162..b7936e5671 100644
--- a/Android.bp
+++ b/Android.bp
@@ -167,7 +167,8 @@ arm_compute_library_defaults {
"-Wno-unused-parameter",
"-DNO_DOT_IN_TOOLCHAIN",
"-Wno-implicit-fallthrough",
- "-fPIC"
+ "-fPIC",
+ "-DACL_INTERNAL_TEST_CKW_IN_DF"
],
rtti: true,
}
diff --git a/arm_compute/core/Types.h b/arm_compute/core/Types.h
index d9c89f8779..a69177ed80 100644
--- a/arm_compute/core/Types.h
+++ b/arm_compute/core/Types.h
@@ -81,6 +81,8 @@ enum class Format
U16, /**< 1 channel, 1 U16 per channel */
S32, /**< 1 channel, 1 S32 per channel */
U32, /**< 1 channel, 1 U32 per channel */
+ S64, /**< 1 channel, 1 S64 per channel */
+ U64, /**< 1 channel, 1 U64 per channel */
BFLOAT16, /**< 16-bit brain floating-point number */
F16, /**< 1 channel, 1 F16 per channel */
F32, /**< 1 channel, 1 F32 per channel */
diff --git a/src/core/Utils.cpp b/src/core/Utils.cpp
index ef7186aad1..89f373fc87 100644
--- a/src/core/Utils.cpp
+++ b/src/core/Utils.cpp
@@ -414,7 +414,7 @@ QuantizationInfo get_softmax_output_quantization_info(DataType input_type, bool
return QuantizationInfo(1.f / 256, 0);
}
-std::pair<int32_t, int32_t> get_quantized_activation_min_max(const ActivationLayerInfo& act_info, DataType data_type, UniformQuantizationInfo oq_info)
+std::pair<int32_t, int32_t> get_quantized_activation_min_max(const ActivationLayerInfo &act_info, DataType data_type, UniformQuantizationInfo oq_info)
{
const bool is_qasymm8_signed = is_data_type_quantized_asymmetric_signed(data_type);
const auto a = act_info.a();
@@ -497,6 +497,12 @@ void print_consecutive_elements(std::ostream &s, DataType dt, const uint8_t *ptr
case DataType::S32:
print_consecutive_elements_impl<int32_t>(s, reinterpret_cast<const int32_t *>(ptr), n, stream_width, element_delim);
break;
+ case DataType::U64:
+ print_consecutive_elements_impl<uint64_t>(s, reinterpret_cast<const uint64_t *>(ptr), n, stream_width, element_delim);
+ break;
+ case DataType::S64:
+ print_consecutive_elements_impl<int64_t>(s, reinterpret_cast<const int64_t *>(ptr), n, stream_width, element_delim);
+ break;
case DataType::BFLOAT16:
print_consecutive_elements_impl<bfloat16>(s, reinterpret_cast<const bfloat16 *>(ptr), n, stream_width, element_delim);
break;
@@ -533,6 +539,10 @@ int max_consecutive_elements_display_width(std::ostream &s, DataType dt, const u
return max_consecutive_elements_display_width_impl<uint32_t>(s, reinterpret_cast<const uint32_t *>(ptr), n);
case DataType::S32:
return max_consecutive_elements_display_width_impl<int32_t>(s, reinterpret_cast<const int32_t *>(ptr), n);
+ case DataType::U64:
+ return max_consecutive_elements_display_width_impl<uint64_t>(s, reinterpret_cast<const uint64_t *>(ptr), n);
+ case DataType::S64:
+ return max_consecutive_elements_display_width_impl<int64_t>(s, reinterpret_cast<const int64_t *>(ptr), n);
case DataType::BFLOAT16:
return max_consecutive_elements_display_width_impl<bfloat16>(s, reinterpret_cast<const bfloat16 *>(ptr), n);
case DataType::F16:
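The two new dispatch cases above feed the same printing helper as the existing types; as an illustrative stand-in (an assumption about its shape, not the library's actual template), such a helper amounts to:

    #include <iomanip>
    #include <ostream>
    #include <string>

    // Sketch only: print n consecutive elements of type T, padded to
    // stream_width and separated by element_delim.
    template <typename T>
    void print_consecutive_elements_sketch(std::ostream &s, const T *ptr, unsigned int n,
                                           int stream_width, const std::string &element_delim)
    {
        for(unsigned int i = 0; i < n; ++i)
        {
            s << std::setw(stream_width) << ptr[i] << element_delim;
        }
    }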
diff --git a/src/cpu/kernels/CpuCastKernel.cpp b/src/cpu/kernels/CpuCastKernel.cpp
index 15a9ddcab4..641dea40dc 100644
--- a/src/cpu/kernels/CpuCastKernel.cpp
+++ b/src/cpu/kernels/CpuCastKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2022 Arm Limited.
+ * Copyright (c) 2016-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -99,9 +99,16 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, Conver
ARM_COMPUTE_RETURN_ERROR_ON_CPU_BF16_UNSUPPORTED(dst);
ARM_COMPUTE_UNUSED(policy);
ARM_COMPUTE_RETURN_ERROR_ON(src == dst);
+#ifdef __aarch64__
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::U8,
+ DataType::S16, DataType::U16, DataType::BFLOAT16, DataType::F16,
+ DataType::F32, DataType::S32, DataType::S64, DataType::U64);
+#else // __aarch64__
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::U8,
DataType::S16, DataType::U16, DataType::BFLOAT16, DataType::F16,
DataType::F32, DataType::S32);
+#endif // __aarch64__
+
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::U8,
DataType::S16, DataType::U16, DataType::BFLOAT16, DataType::F16,
DataType::U32, DataType::S32, DataType::F32);
@@ -141,6 +148,13 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, Conver
&& dst->data_type() != DataType::F16
&& dst->data_type() != DataType::F32 && dst->data_type() != DataType::U8),
"Only data_types supported [in] S32 -> [out] QASYMM8, F16, F32, U8");
+#ifdef __aarch64__
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::S64 && dst->data_type() != DataType::F32,
+ "Only data_types supported [in] S64 -> [out] F32");
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::U64 && dst->data_type() != DataType::F32,
+ "Only data_types supported [in] U64 -> [out] F32");
+#endif // __aarch64__
// Validate in case of configured dst
if(dst->total_size() > 0)
@@ -174,6 +188,111 @@ Status CpuCastKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, C
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, policy));
return Status{};
}
+#ifdef __aarch64__
+namespace
+{
+template <typename T1, typename T2>
+inline void internal_neon_convert(const T1 *src_ptr, T2 *dst_ptr)
+{
+ ARM_COMPUTE_UNUSED(src_ptr);
+ ARM_COMPUTE_UNUSED(dst_ptr);
+}
+
+template <>
+inline void internal_neon_convert<int64_t, float>(const int64_t *src_ptr, float *dst_ptr)
+{
+ const float64x2x4_t texels0 =
+ {
+ {
+ vcvtq_f64_s64(vld1q_s64(src_ptr)),
+ vcvtq_f64_s64(vld1q_s64(src_ptr + 2)),
+ vcvtq_f64_s64(vld1q_s64(src_ptr + 4)),
+ vcvtq_f64_s64(vld1q_s64(src_ptr + 6))
+ }
+ };
+ const float64x2x4_t texels1 =
+ {
+ {
+ vcvtq_f64_s64(vld1q_s64(src_ptr + 8)),
+ vcvtq_f64_s64(vld1q_s64(src_ptr + 10)),
+ vcvtq_f64_s64(vld1q_s64(src_ptr + 12)),
+ vcvtq_f64_s64(vld1q_s64(src_ptr + 14))
+ }
+ };
+ const float32x4x4_t texels =
+ {
+ {
+ vcombine_f32(vcvt_f32_f64(texels0.val[0]), vcvt_f32_f64(texels0.val[1])),
+ vcombine_f32(vcvt_f32_f64(texels0.val[2]), vcvt_f32_f64(texels0.val[3])),
+ vcombine_f32(vcvt_f32_f64(texels1.val[0]), vcvt_f32_f64(texels1.val[1])),
+ vcombine_f32(vcvt_f32_f64(texels1.val[2]), vcvt_f32_f64(texels1.val[3]))
+ }
+ };
+ vst1q_f32(dst_ptr, texels.val[0]);
+ vst1q_f32(dst_ptr + 4, texels.val[1]);
+ vst1q_f32(dst_ptr + 8, texels.val[2]);
+ vst1q_f32(dst_ptr + 12, texels.val[3]);
+}
+
+template <>
+inline void internal_neon_convert<uint64_t, float>(const uint64_t *src_ptr, float *dst_ptr)
+{
+ const float64x2x4_t texels0 =
+ {
+ {
+ vcvtq_f64_u64(vld1q_u64(src_ptr)),
+ vcvtq_f64_u64(vld1q_u64(src_ptr + 2)),
+ vcvtq_f64_u64(vld1q_u64(src_ptr + 4)),
+ vcvtq_f64_u64(vld1q_u64(src_ptr + 6))
+ }
+ };
+ const float64x2x4_t texels1 =
+ {
+ {
+ vcvtq_f64_u64(vld1q_u64(src_ptr + 8)),
+ vcvtq_f64_u64(vld1q_u64(src_ptr + 10)),
+ vcvtq_f64_u64(vld1q_u64(src_ptr + 12)),
+ vcvtq_f64_u64(vld1q_u64(src_ptr + 14))
+ }
+ };
+
+ const float32x4x4_t texels =
+ {
+ {
+ vcombine_f32(vcvt_f32_f64(texels0.val[0]), vcvt_f32_f64(texels0.val[1])),
+ vcombine_f32(vcvt_f32_f64(texels0.val[2]), vcvt_f32_f64(texels0.val[3])),
+ vcombine_f32(vcvt_f32_f64(texels1.val[0]), vcvt_f32_f64(texels1.val[1])),
+ vcombine_f32(vcvt_f32_f64(texels1.val[2]), vcvt_f32_f64(texels1.val[3]))
+ }
+ };
+
+ vst1q_f32(dst_ptr, texels.val[0]);
+ vst1q_f32(dst_ptr + 4, texels.val[1]);
+ vst1q_f32(dst_ptr + 8, texels.val[2]);
+ vst1q_f32(dst_ptr + 12, texels.val[3]);
+}
+
+template <typename T1, typename T2>
+inline void convert64(Iterator &src, Iterator &dst, const Window &win, int window_start_x, int window_end_x, int window_step_x)
+{
+ execute_window_loop(win, [&](const Coordinates &)
+ {
+ const auto src_ptr = reinterpret_cast<const T1 *>(src.ptr());
+ const auto dst_ptr = reinterpret_cast<T2 *>(dst.ptr());
+ int x = window_start_x;
+ for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ internal_neon_convert<T1, T2>(src_ptr + x, dst_ptr + x);
+ }
+ for(; x < window_end_x; ++x)
+ {
+ *(dst_ptr + x) = static_cast<T2>(*(src_ptr + x));
+ }
+ },
+ src, dst);
+}
+} // namespace
+#endif // __aarch64__
void CpuCastKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
{
@@ -203,6 +322,37 @@ void CpuCastKernel::run_op(ITensorPack &tensors, const Window &window, const Thr
switch(_src->info()->data_type())
{
+#ifdef __aarch64__
+ case DataType::U64:
+ {
+ switch(_dst->info()->data_type())
+ {
+ case DataType::F32:
+ {
+ convert64<uint64_t, float>(src, dst, win, window_start_x, window_end_x, window_step_x);
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("dst data type not supported");
+ }
+ break;
+ }
+ case DataType::S64:
+ {
+ switch(_dst->info()->data_type())
+ {
+ case DataType::F32:
+ {
+ convert64<int64_t, float>(src, dst, win, window_start_x, window_end_x, window_step_x);
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("dst data type not supported");
+ }
+ break;
+ }
+#endif // __aarch64__
+
case DataType::QASYMM8_SIGNED:
{
switch(_dst->info()->data_type())
@@ -909,7 +1059,6 @@ void CpuCastKernel::run_op(ITensorPack &tensors, const Window &window, const Thr
ARM_COMPUTE_ERROR("dst data type not supported");
}
break;
-
case DataType::S32:
switch(_dst->info()->data_type())
{
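A note on the vector path added above: NEON has no instruction that converts 64-bit integer lanes straight to f32 lanes, so internal_neon_convert widens each pair of lanes to f64 (vcvtq_f64_s64 / vcvtq_f64_u64) and then narrows back to f32 (vcvt_f32_f64 + vcombine_f32), 16 elements per step. Modelled per element in scalar code, as a sketch:

    #include <cstdint>

    // What each vector lane computes in the S64 -> F32 path (sketch only).
    static inline float s64_to_f32_via_f64(int64_t v)
    {
        const double wide = static_cast<double>(v); // vcvtq_f64_s64 step
        return static_cast<float>(wide);            // vcvt_f32_f64 step
    }

The leftover-elements loop in convert64 instead casts straight to float, which rounds once rather than twice; for magnitudes above 2^53, where int64_t values are no longer exact in f64, the two routes could in principle differ by one ULP.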
diff --git a/src/cpu/kernels/CpuCastKernel.h b/src/cpu/kernels/CpuCastKernel.h
index de4ace2140..76237368d8 100644
--- a/src/cpu/kernels/CpuCastKernel.h
+++ b/src/cpu/kernels/CpuCastKernel.h
@@ -57,9 +57,10 @@ public:
* - BFLOAT16 -> F32
* - F16 -> QASYMM8_SIGNED, QASYMM8, F32, S32, U8
* - S32 -> QASYMM8_SIGNED, QASYMM8, F16, F32, U8
+ * - S64 -> F32
+ * - U64 -> F32
* - F32 -> QASYMM8_SIGNED, QASYMM8, BFLOAT16, F16, S32, U8
*
- * @param[in] src The src tensor to convert. Data types supported: QASYMM8_SIGNED/QASYMM8/U8/U16/S16/BFLOAT16/F16/F32.
+ * @param[in] src The src tensor to convert. Data types supported: QASYMM8_SIGNED/QASYMM8/U8/U16/S16/S32/S64/U64/BFLOAT16/F16/F32.
* @param[out] dst The dst tensor. Data types supported: QASYMM8_SIGNED/QASYMM8/U8/U16/S16/U32/S32/BFLOAT16/F16/F32.
* @param[in] policy Conversion policy.
*
diff --git a/src/cpu/operators/CpuCast.h b/src/cpu/operators/CpuCast.h
index a8342581cb..356b033dbd 100644
--- a/src/cpu/operators/CpuCast.h
+++ b/src/cpu/operators/CpuCast.h
@@ -52,8 +52,9 @@ public:
* |F16 | QASYMM8_SIGNED, QASYMM8, F32, S32, U8 |
* |S32 | QASYMM8_SIGNED, QASYMM8, F16, F32, U8 |
* |F32 | QASYMM8_SIGNED, QASYMM8, BFLOAT16, F16, S32, U8|
+ * |S64 | F32 |
+ * |U64 | F32 |
*
- * @param[in] src The source tensor to convert. Data types supported: U8/S8/U16/S16/U32/S32/F16/F32.
+ * @param[in] src The source tensor to convert. Data types supported: U8/S8/U16/S16/U32/S32/S64/U64/F16/F32.
* @param[out] dst The destination tensor. Data types supported: U8/S8/U16/S16/U32/S32/F16/F32.
* @param[in] policy Conversion policy.
*
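Because the 64-bit inputs only convert to F32, callers can probe support up front through the operator's validate() entry point (which the public NECast::validate ultimately forwards to). A hedged sketch, with cast_supported() being an assumed helper name:

    #include "arm_compute/core/Error.h"
    #include "arm_compute/core/TensorInfo.h"
    #include "src/cpu/operators/CpuCast.h"

    using namespace arm_compute;

    // On aarch64 builds, S64 -> F32 passes while e.g. S64 -> F16 is
    // rejected with a descriptive error message.
    bool cast_supported(const ITensorInfo &src, const ITensorInfo &dst)
    {
        const Status st = cpu::CpuCast::validate(&src, &dst, ConvertPolicy::SATURATE);
        return st.error_code() == ErrorCode::OK;
    }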
diff --git a/tests/SimpleTensor.h b/tests/SimpleTensor.h
index 9ea171d492..419621e808 100644
--- a/tests/SimpleTensor.h
+++ b/tests/SimpleTensor.h
@@ -392,6 +392,8 @@ int SimpleTensor<T>::num_channels() const
case Format::S16:
case Format::U32:
case Format::S32:
+ case Format::U64:
+ case Format::S64:
case Format::F16:
case Format::F32:
return 1;
diff --git a/tests/validation/NEON/Cast.cpp b/tests/validation/NEON/Cast.cpp
index 166847ed66..a1ddcc9cad 100644
--- a/tests/validation/NEON/Cast.cpp
+++ b/tests/validation/NEON/Cast.cpp
@@ -101,6 +101,11 @@ const auto CastF32toS32Dataset = combine(framework::dataset::make("Da
const auto CastF32toQASYMM8Dataset = combine(framework::dataset::make("DataType", DataType::F32), framework::dataset::make("DataType", DataType::QASYMM8));
const auto CastF32toQASYMM8_SIGNEDDataset = combine(framework::dataset::make("DataType", DataType::F32), framework::dataset::make("DataType", DataType::QASYMM8_SIGNED));
+// U64
+const auto CastU64toF32Dataset = combine(framework::dataset::make("DataType", DataType::U64), framework::dataset::make("DataType", DataType::F32));
+
+// S64
+const auto CastS64toF32Dataset = combine(framework::dataset::make("DataType", DataType::S64), framework::dataset::make("DataType", DataType::F32));
} // namespace
TEST_SUITE(NEON)
@@ -108,6 +113,8 @@ TEST_SUITE(Cast)
template <typename T>
using NECastToU8Fixture = CastValidationFixture<Tensor, Accessor, NECast, T, uint8_t>;
template <typename T>
+using NECastToS8Fixture = CastValidationFixture<Tensor, Accessor, NECast, T, int8_t>;
+template <typename T>
using NECastToU16Fixture = CastValidationFixture<Tensor, Accessor, NECast, T, uint16_t>;
template <typename T>
using NECastToS16Fixture = CastValidationFixture<Tensor, Accessor, NECast, T, int16_t>;
@@ -116,6 +123,10 @@ using NECastToU32Fixture = CastValidationFixture<Tensor, Accessor, NECast, T, ui
template <typename T>
using NECastToS32Fixture = CastValidationFixture<Tensor, Accessor, NECast, T, int32_t>;
template <typename T>
+using NECastToU64Fixture = CastValidationFixture<Tensor, Accessor, NECast, T, uint64_t>;
+template <typename T>
+using NECastToS64Fixture = CastValidationFixture<Tensor, Accessor, NECast, T, int64_t>;
+template <typename T>
using NECastToF16Fixture = CastValidationFixture<Tensor, Accessor, NECast, T, half>;
template <typename T>
using NECastToF32Fixture = CastValidationFixture<Tensor, Accessor, NECast, T, float>;
@@ -189,6 +200,14 @@ CAST_SUITE(F32_to_F16, DataType::F32, DataType::F16, NECastToF16Fixture<float>,
CAST_SUITE(F32_to_S32, DataType::F32, DataType::S32, NECastToS32Fixture<float>, CastF32toS32Dataset, one_tolerance)
CAST_SUITE(F32_to_U8, DataType::F32, DataType::S32, NECastToS32Fixture<float>, CastF32toS32Dataset, one_tolerance)
+#ifdef __aarch64__
+// S64
+CAST_SUITE(S64_to_F32, DataType::S64, DataType::F32, NECastToF32Fixture<int64_t>, CastS64toF32Dataset, zero_tolerance)
+
+// U64
+CAST_SUITE(U64_to_F32, DataType::U64, DataType::F32, NECastToF32Fixture<uint64_t>, CastU64toF32Dataset, zero_tolerance)
+#endif // __aarch64__
+
DATA_TEST_CASE(KernelSelectionDstFP16, framework::DatasetMode::ALL,
combine(framework::dataset::make("CpuExt", std::string("NEON")),
framework::dataset::make("DataType",
diff --git a/tests/validation/reference/DepthConvertLayer.cpp b/tests/validation/reference/DepthConvertLayer.cpp
index 8797722f00..1e4939129e 100644
--- a/tests/validation/reference/DepthConvertLayer.cpp
+++ b/tests/validation/reference/DepthConvertLayer.cpp
@@ -194,6 +194,7 @@ template SimpleTensor<int16_t> depth_convert(const SimpleTensor<int64_t> &src, D
template SimpleTensor<uint32_t> depth_convert(const SimpleTensor<int64_t> &src, DataType dt_out, ConvertPolicy policy, uint32_t shift);
template SimpleTensor<int32_t> depth_convert(const SimpleTensor<int64_t> &src, DataType dt_out, ConvertPolicy policy, uint32_t shift);
template SimpleTensor<half> depth_convert(const SimpleTensor<int64_t> &src, DataType dt_out, ConvertPolicy policy, uint32_t shift);
+template SimpleTensor<float> depth_convert(const SimpleTensor<int64_t> &src, DataType dt_out, ConvertPolicy policy, uint32_t shift);
// U64
template SimpleTensor<uint8_t> depth_convert(const SimpleTensor<uint64_t> &src, DataType dt_out, ConvertPolicy policy, uint32_t shift);
@@ -203,6 +204,7 @@ template SimpleTensor<int16_t> depth_convert(const SimpleTensor<uint64_t> &src,
template SimpleTensor<uint32_t> depth_convert(const SimpleTensor<uint64_t> &src, DataType dt_out, ConvertPolicy policy, uint32_t shift);
template SimpleTensor<int32_t> depth_convert(const SimpleTensor<uint64_t> &src, DataType dt_out, ConvertPolicy policy, uint32_t shift);
template SimpleTensor<half> depth_convert(const SimpleTensor<uint64_t> &src, DataType dt_out, ConvertPolicy policy, uint32_t shift);
+template SimpleTensor<float> depth_convert(const SimpleTensor<uint64_t> &src, DataType dt_out, ConvertPolicy policy, uint32_t shift);
} // namespace reference
} // namespace validation
} // namespace test
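For completeness, the reference instantiations added here reduce to an element-wise cast; a self-contained sketch of the idea (the real depth_convert template also carries ConvertPolicy and shift parameters, which the S64/U64 -> F32 case does not need, since every 64-bit integer fits comfortably inside f32's range and merely rounds to the nearest representable value):

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // Sketch: widen 64-bit integers to f32, element by element.
    template <typename T_in>
    std::vector<float> depth_convert_to_f32_sketch(const std::vector<T_in> &src)
    {
        std::vector<float> dst(src.size());
        for(std::size_t i = 0; i < src.size(); ++i)
        {
            dst[i] = static_cast<float>(src[i]);
        }
        return dst;
    }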