From 81f0d15d6840a0ae8ef571114555a26da74c4a43 Mon Sep 17 00:00:00 2001 From: Michele Di Giorgio Date: Tue, 11 Jul 2017 15:00:52 +0100 Subject: COMPMID-444: Add support for QS8/QS16 NEON Arithmetic Add/Sub/Mul. Change-Id: Ia482498688ca1884272b5062e3415e736e03d36f Reviewed-on: http://mpd-gerrit.cambridge.arm.com/80448 Reviewed-by: Georgios Pinitas Tested-by: Kaizen --- .../NEON/kernels/NEArithmeticAdditionKernel.cpp | 48 +++++++++++++++- .../NEON/kernels/NEArithmeticSubtractionKernel.cpp | 48 +++++++++++++++- .../kernels/NEPixelWiseMultiplicationKernel.cpp | 64 +++++++++++++++++++--- 3 files changed, 147 insertions(+), 13 deletions(-) (limited to 'src/core') diff --git a/src/core/NEON/kernels/NEArithmeticAdditionKernel.cpp b/src/core/NEON/kernels/NEArithmeticAdditionKernel.cpp index c0809eb9fa..7f7e45a940 100644 --- a/src/core/NEON/kernels/NEArithmeticAdditionKernel.cpp +++ b/src/core/NEON/kernels/NEArithmeticAdditionKernel.cpp @@ -27,6 +27,7 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/IAccessWindow.h" #include "arm_compute/core/ITensor.h" +#include "arm_compute/core/NEON/NEFixedPoint.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Validate.h" @@ -45,6 +46,38 @@ class Coordinates; namespace { +void add_wrap_QS8_QS8_QS8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + Iterator input1(in1, window); + Iterator input2(in2, window); + Iterator output(out, window); + + execute_window_loop(window, [&](const Coordinates & id) + { + const qint8x16_t a = vld1q_qs8(reinterpret_cast(input1.ptr())); + const qint8x16_t b = vld1q_qs8(reinterpret_cast(input2.ptr())); + + vst1q_qs8(reinterpret_cast(output.ptr()), vaddq_qs8(a, b)); + }, + input1, input2, output); +} + +void add_saturate_QS8_QS8_QS8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + Iterator input1(in1, window); + Iterator input2(in2, window); + Iterator output(out, window); + + execute_window_loop(window, [&](const Coordinates & id) + { + const qint8x16_t a = vld1q_qs8(reinterpret_cast(input1.ptr())); + const qint8x16_t b = vld1q_qs8(reinterpret_cast(input2.ptr())); + + vst1q_qs8(reinterpret_cast(output.ptr()), vqaddq_qs8(a, b)); + }, + input1, input2, output); +} + void add_wrap_U8_U8_U8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) { Iterator input1(in1, window); @@ -352,14 +385,21 @@ void NEArithmeticAdditionKernel::configure(const ITensor *input1, const ITensor } ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input1, input2, output); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::QS8, DataType::U8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::QS8, DataType::U8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::U8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32); ARM_COMPUTE_ERROR_ON_MSG(output->info()->data_type() == DataType::U8 && (input1->info()->data_type() != DataType::U8 || input2->info()->data_type() != DataType::U8), "Output can only be U8 if both inputs are U8"); + if(is_data_type_fixed_point(input1->info()->data_type()) || is_data_type_fixed_point(input2->info()->data_type()) || is_data_type_fixed_point(output->info()->data_type())) + { + // Check that all data types are the same and all fixed-point positions are the same + ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input1, input2, output); + } static std::map map_function = { + { "add_wrap_QS8_QS8_QS8", &add_wrap_QS8_QS8_QS8 }, + { "add_saturate_QS8_QS8_QS8", &add_saturate_QS8_QS8_QS8 }, { "add_wrap_U8_U8_U8", &add_wrap_U8_U8_U8 }, { "add_saturate_U8_U8_U8", &add_saturate_U8_U8_U8 }, { "add_wrap_S16_U8_S16", &add_wrap_S16_U8_S16 }, @@ -368,6 +408,8 @@ void NEArithmeticAdditionKernel::configure(const ITensor *input1, const ITensor { "add_saturate_U8_S16_S16", &add_saturate_U8_S16_S16 }, { "add_wrap_U8_U8_S16", &add_wrap_U8_U8_S16 }, { "add_saturate_U8_U8_S16", &add_saturate_U8_U8_S16 }, + { "add_wrap_QS16_QS16_QS16", &add_wrap_S16_S16_S16 }, + { "add_saturate_QS16_QS16_QS16", &add_saturate_S16_S16_S16 }, { "add_wrap_S16_S16_S16", &add_wrap_S16_S16_S16 }, { "add_saturate_S16_S16_S16", &add_saturate_S16_S16_S16 }, { "add_wrap_F32_F32_F32", &add_F32_F32_F32 }, diff --git a/src/core/NEON/kernels/NEArithmeticSubtractionKernel.cpp b/src/core/NEON/kernels/NEArithmeticSubtractionKernel.cpp index f51b6b9f0b..cac2a6bd05 100644 --- a/src/core/NEON/kernels/NEArithmeticSubtractionKernel.cpp +++ b/src/core/NEON/kernels/NEArithmeticSubtractionKernel.cpp @@ -26,6 +26,7 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" +#include "arm_compute/core/NEON/NEFixedPoint.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Validate.h" @@ -44,6 +45,38 @@ class Coordinates; namespace { +void sub_wrap_QS8_QS8_QS8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + Iterator input1(in1, window); + Iterator input2(in2, window); + Iterator output(out, window); + + execute_window_loop(window, [&](const Coordinates & id) + { + const qint8x16_t a = vld1q_qs8(reinterpret_cast(input1.ptr())); + const qint8x16_t b = vld1q_qs8(reinterpret_cast(input2.ptr())); + + vst1q_qs8(reinterpret_cast(output.ptr()), vsubq_qs8(a, b)); + }, + input1, input2, output); +} + +void sub_saturate_QS8_QS8_QS8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + Iterator input1(in1, window); + Iterator input2(in2, window); + Iterator output(out, window); + + execute_window_loop(window, [&](const Coordinates & id) + { + const qint8x16_t a = vld1q_qs8(reinterpret_cast(input1.ptr())); + const qint8x16_t b = vld1q_qs8(reinterpret_cast(input2.ptr())); + + vst1q_qs8(reinterpret_cast(output.ptr()), vqsubq_qs8(a, b)); + }, + input1, input2, output); +} + void sub_wrap_U8_U8_U8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) { Iterator input1(in1, window); @@ -302,14 +335,21 @@ void NEArithmeticSubtractionKernel::configure(const ITensor *input1, const ITens } ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input1, input2, output); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::S16, DataType::F32); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::S16, DataType::F32); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::QS8, DataType::U8, DataType::QS16, DataType::S16, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::QS8, DataType::U8, DataType::QS16, DataType::S16, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::U8, DataType::QS16, DataType::S16, DataType::F32); ARM_COMPUTE_ERROR_ON_MSG(output->info()->data_type() == DataType::U8 && (input1->info()->data_type() != DataType::U8 || input2->info()->data_type() != DataType::U8), "Output can only be U8 if both inputs are U8"); + if(is_data_type_fixed_point(input1->info()->data_type()) || is_data_type_fixed_point(input2->info()->data_type()) || is_data_type_fixed_point(output->info()->data_type())) + { + // Check that all data types are the same and all fixed-point positions are the same + ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input1, input2, output); + } static std::map map_function = { + { "sub_wrap_QS8_QS8_QS8", &sub_wrap_QS8_QS8_QS8 }, + { "sub_saturate_QS8_QS8_QS8", &sub_saturate_QS8_QS8_QS8 }, { "sub_wrap_U8_U8_U8", &sub_wrap_U8_U8_U8 }, { "sub_wrap_U8_U8_S16", &sub_wrap_U8_U8_S16 }, { "sub_saturate_U8_U8_U8", &sub_saturate_U8_U8_U8 }, @@ -318,6 +358,8 @@ void NEArithmeticSubtractionKernel::configure(const ITensor *input1, const ITens { "sub_wrap_S16_U8_S16", &sub_wrap_S16_U8_S16 }, { "sub_saturate_U8_S16_S16", &sub_saturate_U8_S16_S16 }, { "sub_saturate_S16_U8_S16", &sub_saturate_S16_U8_S16 }, + { "sub_wrap_QS16_QS16_QS16", &sub_wrap_S16_S16_S16 }, + { "sub_saturate_QS16_QS16_QS16", &sub_saturate_S16_S16_S16 }, { "sub_wrap_S16_S16_S16", &sub_wrap_S16_S16_S16 }, { "sub_saturate_S16_S16_S16", &sub_saturate_S16_S16_S16 }, { "sub_wrap_F32_F32_F32", &sub_F32_F32_F32 }, diff --git a/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp b/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp index 83d6d8218e..150db39695 100644 --- a/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp +++ b/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp @@ -147,6 +147,46 @@ void mul_QS8_QS8_QS8_n(const void *__restrict input1_ptr, const void *__restrict vst1q_s8(output, res); } +template +void mul_QS16_QS16_QS16_n(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int n, int fixed_point_position) +{ + // n is the exponent of the scaling factor, that is scale = 1/2^n. Currently, we only support scaling factor equal to 1 => n = 0. + ARM_COMPUTE_ERROR_ON_MSG(n != 0, "Scaling factor different than 1 not supported for 16-bit fixed-point pixel-wise multiplication"); + ARM_COMPUTE_UNUSED(n); + + const qint16x8x2_t ta1 = vld2q_qs16(static_cast(input1_ptr)); + const qint16x8x2_t ta2 = vld2q_qs16(static_cast(input2_ptr)); + + if(is_sat) + { + const qint16x8x2_t res = + { + { + // First 8 elements + vqmulq_qs16(ta1.val[0], ta2.val[0], fixed_point_position), + // Second 8 elements + vqmulq_qs16(ta1.val[1], ta2.val[1], fixed_point_position) + } + }; + + vst2q_s16(static_cast(output_ptr), res); + } + else + { + const qint16x8x2_t res = + { + { + // First 8 elements + vmulq_qs16(ta1.val[0], ta2.val[0], fixed_point_position), + // Second 8 elements + vmulq_qs16(ta1.val[1], ta2.val[1], fixed_point_position) + } + }; + + vst2q_s16(static_cast(output_ptr), res); + } +} + template inline int16x8_t mul_S16_S16_S16_n_loop(const int16x8_t &input1, const int16x8_t &input2, int n) { @@ -389,16 +429,15 @@ void NEPixelWiseMultiplicationKernel::configure(const ITensor *input1, const ITe } ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input1, input2, output); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::QS8, DataType::S16, DataType::F16, DataType::F32); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::QS8, DataType::S16, DataType::F16, DataType::F32); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QS8, DataType::S16, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32); ARM_COMPUTE_ERROR_ON_MSG(output->info()->data_type() == DataType::U8 && (input1->info()->data_type() != DataType::U8 || input2->info()->data_type() != DataType::U8), "Output can only be U8 if both inputs are U8"); - if(input1->info()->data_type() == DataType::QS8) + if(is_data_type_fixed_point(input1->info()->data_type()) || is_data_type_fixed_point(input2->info()->data_type()) || is_data_type_fixed_point(output->info()->data_type())) { - // All data types must be QS8 - ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2, output); - ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input1, input2, output); + // Check that all data types are the same and all fixed-point positions are the same + ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input1, input2, output); } _input1 = input1; @@ -513,6 +552,17 @@ void NEPixelWiseMultiplicationKernel::configure(const ITensor *input1, const ITe _func_q_int = is_sat ? &mul_QS8_QS8_QS8_n : &mul_QS8_QS8_QS8_n; } } + else if(DataType::QS16 == dt_input1 && DataType::QS16 == dt_input2 && DataType::QS16 == dt_output) + { + if(is_scale_255) + { + _func_q_int = is_sat ? &mul_QS16_QS16_QS16_n : &mul_QS16_QS16_QS16_n; + } + else + { + _func_q_int = is_sat ? &mul_QS16_QS16_QS16_n : &mul_QS16_QS16_QS16_n; + } + } else if(DataType::F16 == dt_input1 && DataType::F16 == dt_input2 && DataType::F16 == dt_output) { _func_float = &mul_F16_F16_F16_n; -- cgit v1.2.1