diff options
3 files changed, 85 insertions, 9 deletions
diff --git a/arm_compute/core/NEON/kernels/NEArithmeticSubtractionKernel.h b/arm_compute/core/NEON/kernels/NEArithmeticSubtractionKernel.h index 6f88d2757a..a11bf44458 100644 --- a/arm_compute/core/NEON/kernels/NEArithmeticSubtractionKernel.h +++ b/arm_compute/core/NEON/kernels/NEArithmeticSubtractionKernel.h @@ -50,9 +50,21 @@ public: /** Initialise the kernel's input, output and border mode. * - * @param[in] input1 An input tensor. Data types supported: U8/QS8/QS16/S16/F32 - * @param[in] input2 An input tensor. Data types supported: U8, QS8 (only if @p input1 is QS8),QS16 (only if @p input1 is QS16), S16/F32 (only if @p input1 is F32). - * @param[out] output The output tensor. Data types supported: U8 (Only if both inputs are U8), QS8 (only if both inputs are QS8), QS16 (only if both inputs are QS16), S16/F32 (only if both inputs are F32). + * Valid configurations (Input1,Input2) -> Output : + * + * - (U8,U8) -> U8 + * - (QS8,QS8) -> QS8 + * - (U8,U8) -> S16 + * - (S16,U8) -> S16 + * - (U8,S16) -> S16 + * - (S16,S16) -> S16 + * - (QS16,QS16) -> QS16 + * - (F16,F16) -> F16 + * - (F32,F32) -> F32 + * + * @param[in] input1 An input tensor. Data types supported: U8/QS8/QS16/S16/F16/F32 + * @param[in] input2 An input tensor. Data types supported: U8/QS8/QS16/S16/F16/F32 + * @param[out] output The output tensor. Data types supported: U8/QS8/QS16/S16/F16/F32. * @param[in] policy Overflow policy. */ void configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy); @@ -63,9 +75,9 @@ public: private: /** Common signature for all the specialised sub functions * - * @param[in] input1 An input tensor. Data types supported: U8/S16/F32 - * @param[in] input2 An input tensor. Data types supported: U8/S16/F32 (only if @p input1 is F32). - * @param[out] output The output tensor. Data types supported: U8 (Only if both inputs are U8), S16/F32 (only if both inputs are F32). + * @param[in] input1 An input tensor. 
Data types supported: U8/QS8/QS16/S16/F16/F32 + * @param[in] input2 An input tensor. Data types supported: U8/QS8/QS16/S16/F16/F32 + * @param[out] output The output tensor. Data types supported: U8/QS8/QS16/S16/F16/F32. * @param[in] window Region on which to execute the kernel. */ using SubFunction = void(const ITensor *input1, const ITensor *input2, ITensor *output, const Window &window); diff --git a/src/core/NEON/kernels/NEArithmeticSubtractionKernel.cpp b/src/core/NEON/kernels/NEArithmeticSubtractionKernel.cpp index cac2a6bd05..be8574317b 100644 --- a/src/core/NEON/kernels/NEArithmeticSubtractionKernel.cpp +++ b/src/core/NEON/kernels/NEArithmeticSubtractionKernel.cpp @@ -157,6 +157,45 @@ void sub_saturate_S16_S16_S16(const ITensor *in1, const ITensor *in2, ITensor *o input1, input2, output); } +#ifdef ARM_COMPUTE_ENABLE_FP16 +inline float16x8x2_t vsub2q_f16(const float16x8x2_t &a, const float16x8x2_t &b) +{ + const float16x8x2_t res = + { + { + vsubq_f16(a.val[0], b.val[0]), + vsubq_f16(a.val[1], b.val[1]) + } + }; + + return res; +} +#endif /* ARM_COMPUTE_ENABLE_FP16 */ + +void sub_F16_F16_F16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ +#ifdef ARM_COMPUTE_ENABLE_FP16 + Iterator input1(in1, window); + Iterator input2(in2, window); + Iterator output(out, window); + + execute_window_loop(window, [&](const Coordinates & id) + { + const float16x8x2_t a = vld2q_f16(reinterpret_cast<const float16_t *>(input1.ptr())); + const float16x8x2_t b = vld2q_f16(reinterpret_cast<const float16_t *>(input2.ptr())); + + vst2q_f16(reinterpret_cast<float16_t *>(output.ptr()), vsub2q_f16(a, b)); + }, + input1, input2, output); +#else /* ARM_COMPUTE_ENABLE_FP16 */ + ARM_COMPUTE_UNUSED(in1); + ARM_COMPUTE_UNUSED(in2); + ARM_COMPUTE_UNUSED(out); + ARM_COMPUTE_UNUSED(window); + ARM_COMPUTE_ERROR("Not supported, recompile the library with arch=arm64-v8.2-a"); +#endif /* ARM_COMPUTE_ENABLE_FP16 */ +} + void sub_F32_F32_F32(const ITensor *in1, const 
ITensor *in2, ITensor *out, const Window &window) { Iterator input1(in1, window); @@ -328,6 +367,10 @@ void NEArithmeticSubtractionKernel::configure(const ITensor *input1, const ITens { set_format_if_unknown(*output->info(), Format::S16); } + else if(input1->info()->data_type() == DataType::F16 || input2->info()->data_type() == DataType::F16) + { + set_format_if_unknown(*output->info(), Format::F16); + } else if(input1->info()->data_type() == DataType::F32 || input2->info()->data_type() == DataType::F32) { set_format_if_unknown(*output->info(), Format::F32); @@ -335,9 +378,9 @@ void NEArithmeticSubtractionKernel::configure(const ITensor *input1, const ITens } ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input1, input2, output); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::QS8, DataType::U8, DataType::QS16, DataType::S16, DataType::F32); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::QS8, DataType::U8, DataType::QS16, DataType::S16, DataType::F32); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::U8, DataType::QS16, DataType::S16, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::QS8, DataType::U8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::QS8, DataType::U8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::U8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32); ARM_COMPUTE_ERROR_ON_MSG(output->info()->data_type() == DataType::U8 && (input1->info()->data_type() != DataType::U8 || input2->info()->data_type() != DataType::U8), "Output can only be U8 if both inputs are U8"); if(is_data_type_fixed_point(input1->info()->data_type()) || is_data_type_fixed_point(input2->info()->data_type()) || is_data_type_fixed_point(output->info()->data_type())) @@ -364,6 +407,9 @@ void 
NEArithmeticSubtractionKernel::configure(const ITensor *input1, const ITens { "sub_saturate_S16_S16_S16", &sub_saturate_S16_S16_S16 }, { "sub_wrap_F32_F32_F32", &sub_F32_F32_F32 }, { "sub_saturate_F32_F32_F32", &sub_F32_F32_F32 }, + { "sub_wrap_F16_F16_F16", &sub_F16_F16_F16 }, + { "sub_saturate_F16_F16_F16", &sub_F16_F16_F16 }, + }; _input1 = input1; diff --git a/tests/validation/NEON/ArithmeticSubtraction.cpp b/tests/validation/NEON/ArithmeticSubtraction.cpp index 54cd9f04ba..23a320a84d 100644 --- a/tests/validation/NEON/ArithmeticSubtraction.cpp +++ b/tests/validation/NEON/ArithmeticSubtraction.cpp @@ -200,6 +200,7 @@ BOOST_DATA_TEST_CASE(RunSmall, SmallShapes() * ConvertPolicies() * boost::unit_t // Validate output validate(NEAccessor(dst), ref_dst); } + BOOST_TEST_DECORATOR(*boost::unit_test::label("nightly")) BOOST_DATA_TEST_CASE(RunLarge, LargeShapes() * ConvertPolicies() * boost::unit_test::data::xrange(1, 7), shape, policy, fixed_point_position) @@ -245,6 +246,23 @@ BOOST_DATA_TEST_CASE(RunLarge, LargeShapes() * ConvertPolicies() * boost::unit_t BOOST_AUTO_TEST_SUITE_END() BOOST_AUTO_TEST_SUITE_END() +#ifdef ARM_COMPUTE_ENABLE_FP16 +BOOST_AUTO_TEST_SUITE(Float16) +BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit")) +BOOST_DATA_TEST_CASE(RunSmall, SmallShapes(), shape) +{ + // Compute function + Tensor dst = compute_arithmetic_subtraction(shape, DataType::F16, DataType::F16, DataType::F16, ConvertPolicy::WRAP); + + // Compute reference + RawTensor ref_dst = Reference::compute_reference_arithmetic_subtraction(shape, DataType::F16, DataType::F16, DataType::F16, ConvertPolicy::WRAP); + + // Validate output + validate(NEAccessor(dst), ref_dst); +} +BOOST_AUTO_TEST_SUITE_END() +#endif /* ARM_COMPUTE_ENABLE_FP16 */ + BOOST_AUTO_TEST_SUITE(Float) BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit") * boost::unit_test::label("nightly")) BOOST_DATA_TEST_CASE(Configuration, (SmallShapes() + LargeShapes()) * boost::unit_test::data::make({ 
ConvertPolicy::SATURATE, ConvertPolicy::WRAP }),