diff options
author | Ramy Elgammal <ramy.elgammal@arm.com> | 2024-05-14 15:21:07 +0100 |
---|---|---|
committer | Pablo Marquez Tello <pablo.tello@arm.com> | 2024-05-14 18:35:14 +0000 |
commit | b4b61a649deef5374fa070b79562ae83d7f64f23 (patch) | |
tree | 946ef0967cb3a5575f3e3a76eeaee4cd42b25bce | |
parent | 2217f1e60964fe586cae7ef996af7ef1c0bef2ab (diff) | |
download | ComputeLibrary-b4b61a649deef5374fa070b79562ae83d7f64f23.tar.gz |
Rework CpuQuantizeKernel to enable FP16 in multi_isa builds
Resolves: COMPMID-7054
Signed-off-by: Ramy Elgammal <ramy.elgammal@arm.com>
Change-Id: I68d125b81ad7f74b2594ccda8d6ec08beef1ebd7
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/11555
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Pablo Marquez Tello <pablo.tello@arm.com>
Benchmark: Arm Jenkins <bsgcomp@arm.com>
-rw-r--r-- | Android.bp | 4 | ||||
-rw-r--r-- | docs/user_guide/release_version_and_change_log.dox | 2 | ||||
-rw-r--r-- | filelist.json | 10 | ||||
-rw-r--r-- | src/BUILD.bazel | 4 | ||||
-rw-r--r-- | src/CMakeLists.txt | 4 | ||||
-rw-r--r-- | src/cpu/kernels/CpuQuantizeKernel.cpp | 344 | ||||
-rw-r--r-- | src/cpu/kernels/CpuQuantizeKernel.h | 26 | ||||
-rw-r--r-- | src/cpu/kernels/quantize/generic/neon/fp16.cpp | 45 | ||||
-rw-r--r-- | src/cpu/kernels/quantize/generic/neon/fp32.cpp | 48 | ||||
-rw-r--r-- | src/cpu/kernels/quantize/generic/neon/impl.h | 302 | ||||
-rw-r--r-- | src/cpu/kernels/quantize/generic/neon/impl_fp16.h | 46 | ||||
-rw-r--r-- | src/cpu/kernels/quantize/generic/neon/impl_fp32.h | 44 | ||||
-rw-r--r-- | src/cpu/kernels/quantize/generic/neon/integer.cpp | 82 | ||||
-rw-r--r-- | src/cpu/kernels/quantize/generic/neon/list.h | 66 | ||||
-rw-r--r-- | src/cpu/kernels/quantize/generic/neon/vquantize.cpp | 41 |
15 files changed, 722 insertions, 346 deletions
diff --git a/Android.bp b/Android.bp index c6e89b02c4..fd8afffcb0 100644 --- a/Android.bp +++ b/Android.bp @@ -554,6 +554,10 @@ cc_library_static { "src/cpu/kernels/pool3d/neon/fp32.cpp", "src/cpu/kernels/pool3d/neon/qasymm8.cpp", "src/cpu/kernels/pool3d/neon/qasymm8_signed.cpp", + "src/cpu/kernels/quantize/generic/neon/fp16.cpp", + "src/cpu/kernels/quantize/generic/neon/fp32.cpp", + "src/cpu/kernels/quantize/generic/neon/integer.cpp", + "src/cpu/kernels/quantize/generic/neon/vquantize.cpp", "src/cpu/kernels/range/generic/neon/fp16.cpp", "src/cpu/kernels/range/generic/neon/fp32.cpp", "src/cpu/kernels/range/generic/neon/integer.cpp", diff --git a/docs/user_guide/release_version_and_change_log.dox b/docs/user_guide/release_version_and_change_log.dox index 9c3eb8e2e9..f493ff631e 100644 --- a/docs/user_guide/release_version_and_change_log.dox +++ b/docs/user_guide/release_version_and_change_log.dox @@ -43,7 +43,7 @@ If there is more than one release in a month then an extra sequential number is v24.05 Public major release - Add @ref CLScatter operator for FP32/16, S32/16/8, U32/16/8 data types - - Fix @ref NEReductionOperationKernel FP16 for armv8a multi_isa builds + - Various fixes to enable FP16 kernels in armv8a multi_isa builds. v24.04 Public major release - Add Bfloat16 data type support for @ref NEMatMul. 
diff --git a/filelist.json b/filelist.json index 5246f27f68..3ee53042f9 100644 --- a/filelist.json +++ b/filelist.json @@ -2093,8 +2093,14 @@ "common": [ "src/cpu/operators/CpuQuantize.cpp", "src/cpu/kernels/CpuQuantizeKernel.cpp", - "src/runtime/NEON/functions/NEQuantizationLayer.cpp" - ] + "src/runtime/NEON/functions/NEQuantizationLayer.cpp", + "src/cpu/kernels/quantize/generic/neon/vquantize.cpp" + ], + "neon":{ + "fp32":["src/cpu/kernels/quantize/generic/neon/fp32.cpp"], + "fp16":["src/cpu/kernels/quantize/generic/neon/fp16.cpp"], + "integer":["src/cpu/kernels/quantize/generic/neon/integer.cpp"] + } } }, "Range": { diff --git a/src/BUILD.bazel b/src/BUILD.bazel index 2648a968ae..499e5642a6 100644 --- a/src/BUILD.bazel +++ b/src/BUILD.bazel @@ -818,6 +818,10 @@ filegroup( "cpu/kernels/pool3d/neon/fp32.cpp", "cpu/kernels/pool3d/neon/qasymm8.cpp", "cpu/kernels/pool3d/neon/qasymm8_signed.cpp", + "cpu/kernels/quantize/generic/neon/fp16.cpp", + "cpu/kernels/quantize/generic/neon/fp32.cpp", + "cpu/kernels/quantize/generic/neon/integer.cpp", + "cpu/kernels/quantize/generic/neon/vquantize.cpp", "cpu/kernels/range/generic/neon/fp16.cpp", "cpu/kernels/range/generic/neon/fp32.cpp", "cpu/kernels/range/generic/neon/integer.cpp", diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index e47b5cbb4b..8d63ab57a3 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -809,6 +809,10 @@ target_sources( cpu/kernels/pool3d/neon/fp32.cpp cpu/kernels/pool3d/neon/qasymm8.cpp cpu/kernels/pool3d/neon/qasymm8_signed.cpp + cpu/kernels/quantize/generic/neon/fp16.cpp + cpu/kernels/quantize/generic/neon/fp32.cpp + cpu/kernels/quantize/generic/neon/integer.cpp + cpu/kernels/quantize/generic/neon/vquantize.cpp cpu/kernels/range/generic/neon/fp16.cpp cpu/kernels/range/generic/neon/fp32.cpp cpu/kernels/range/generic/neon/integer.cpp diff --git a/src/cpu/kernels/CpuQuantizeKernel.cpp b/src/cpu/kernels/CpuQuantizeKernel.cpp index d2ac6cf8ac..ed4675ae3d 100644 --- 
a/src/cpu/kernels/CpuQuantizeKernel.cpp +++ b/src/cpu/kernels/CpuQuantizeKernel.cpp @@ -29,12 +29,12 @@ #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" +#include "src/core/common/Registrars.h" #include "src/core/CPP/Validate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" -#include "src/core/NEON/NEAsymm.h" -#include "src/core/NEON/NEMath.h" #include "src/core/NEON/wrapper/wrapper.h" +#include "src/cpu/kernels/quantize/generic/neon/list.h" #include <arm_neon.h> #include <map> @@ -47,7 +47,6 @@ namespace kernels { namespace { -constexpr auto window_step = 16; Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst) { @@ -63,59 +62,6 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst) return Status{}; } -template <typename T> -inline float32x4x4_t load_value(const T *input_ptr) -{ - using Tx16_t = typename wrapper::traits::neon_vector<T, 16>::type; - return arm_compute::convert_to_float32x4x4<Tx16_t>(wrapper::vloadq(input_ptr)); -} - -template <> -inline float32x4x4_t load_value(const float *input_ptr) -{ - return {wrapper::vloadq(input_ptr), wrapper::vloadq(input_ptr + 4), wrapper::vloadq(input_ptr + 8), - wrapper::vloadq(input_ptr + 12)}; -} -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -template <> -inline float32x4x4_t load_value(const float16_t *input_ptr) -{ - return {vcvt_f32_f16(wrapper::vload(input_ptr)), vcvt_f32_f16(wrapper::vload(input_ptr + 4)), - vcvt_f32_f16(wrapper::vload(input_ptr + 8)), vcvt_f32_f16(wrapper::vload(input_ptr + 12))}; -} - -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - -template <typename element_type> -using vector_type = wrapper::traits::neon_vector_t<element_type, window_step>; - -template <typename quantized_type> -vector_type<quantized_type> vquantize_qasymm8(const float32x4x4_t &qv, const UniformQuantizationInfo &qi); - -template <> -vector_type<uint8_t> vquantize_qasymm8<uint8_t>(const float32x4x4_t &qv, const 
UniformQuantizationInfo &qi) -{ - return vquantize(qv, qi); -} - -template <> -vector_type<int8_t> vquantize_qasymm8<int8_t>(const float32x4x4_t &qv, const UniformQuantizationInfo &qi) -{ - return vquantize_signed(qv, qi); -} - -template <typename TOut, typename = typename std::enable_if<std::is_signed<TOut>::value, bool>::type> -inline int8x16_t recombine_8_16(int16x8_t lower, int16x8_t upper) -{ - return wrapper::vcombine(wrapper::vqmovn(lower), wrapper::vqmovn(upper)); -} - -template <typename TOut, typename = typename std::enable_if<std::is_unsigned<TOut>::value, bool>::type> -inline uint8x16_t recombine_8_16(int16x8_t lower, int16x8_t upper) -{ - return wrapper::vcombine(wrapper::vqmovun(lower), wrapper::vqmovun(upper)); -} - } // namespace void CpuQuantizeKernel::configure(const ITensorInfo *src, ITensorInfo *dst) @@ -124,38 +70,36 @@ void CpuQuantizeKernel::configure(const ITensorInfo *src, ITensorInfo *dst) ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst)); static const std::map<std::string, QuantizeFunctionExecutorPtr> quant_map = { - {"op_QASYMM8_QASYMM8", &CpuQuantizeKernel::run_quantize_qasymm8<uint8_t, uint8_t>}, - {"op_QASYMM8_QASYMM8_SIGNED", &CpuQuantizeKernel::run_quantize_qasymm8<uint8_t, int8_t>}, - {"op_QASYMM8_QASYMM16", &CpuQuantizeKernel::run_quantize_qasymm16<uint8_t>}, + {"op_QASYMM8_QASYMM8", REGISTER_INTEGER_NEON(u8_u8_run_quantize_qasymm8)}, + {"op_QASYMM8_QASYMM8_SIGNED", REGISTER_INTEGER_NEON(u8_i8_run_quantize_qasymm8)}, + {"op_QASYMM8_QASYMM16", REGISTER_INTEGER_NEON(u8_run_quantize_qasymm16)}, - {"op_QASYMM8_SIGNED_QASYMM8", &CpuQuantizeKernel::run_quantize_qasymm8<int8_t, uint8_t>}, - {"op_QASYMM8_SIGNED_QASYMM8_SIGNED", &CpuQuantizeKernel::run_quantize_qasymm8<int8_t, int8_t>}, - {"op_QASYMM8_SIGNED_QASYMM16", &CpuQuantizeKernel::run_quantize_qasymm16<int8_t>}, + {"op_QASYMM8_SIGNED_QASYMM8", REGISTER_INTEGER_NEON(i8_u8_run_quantize_qasymm8)}, + {"op_QASYMM8_SIGNED_QASYMM8_SIGNED", 
REGISTER_INTEGER_NEON(i8_i8_run_quantize_qasymm8)}, + {"op_QASYMM8_SIGNED_QASYMM16", REGISTER_INTEGER_NEON(i8_run_quantize_qasymm16)}, // Functions for offset only requantization - {"op_OFFSET_ONLY_QASYMM8_QASYMM8", &CpuQuantizeKernel::run_requantize_offset_only<uint8_t, uint8_t>}, - {"op_OFFSET_ONLY_QASYMM8_QASYMM8_SIGNED", &CpuQuantizeKernel::run_requantize_offset_only<uint8_t, int8_t>}, - {"op_OFFSET_ONLY_QASYMM8_SIGNED_QASYMM8", &CpuQuantizeKernel::run_requantize_offset_only<int8_t, uint8_t>}, - {"op_OFFSET_ONLY_QASYMM8_SIGNED_QASYMM8_SIGNED", - &CpuQuantizeKernel::run_requantize_offset_only<int8_t, int8_t>}, + {"op_OFFSET_ONLY_QASYMM8_QASYMM8", REGISTER_INTEGER_NEON(u8_u8_run_requantize_offset_only)}, + {"op_OFFSET_ONLY_QASYMM8_QASYMM8_SIGNED", REGISTER_INTEGER_NEON(u8_i8_run_requantize_offset_only)}, + {"op_OFFSET_ONLY_QASYMM8_SIGNED_QASYMM8", REGISTER_INTEGER_NEON(i8_u8_run_requantize_offset_only)}, + {"op_OFFSET_ONLY_QASYMM8_SIGNED_QASYMM8_SIGNED", REGISTER_INTEGER_NEON(i8_i8_run_requantize_offset_only)}, // Functions for offset uint8 to int8 and vice versa quantization (no scale changes) {"op_OFFSET_ONLY_CONVERT_QASYMM8_SIGNED_QASYMM8", - &CpuQuantizeKernel::run_requantize_offset_only_convert<int8_t, uint8_t>}, + REGISTER_INTEGER_NEON(i8_u8_run_requantize_offset_only_convert)}, {"op_OFFSET_ONLY_CONVERT_QASYMM8_QASYMM8_SIGNED", - &CpuQuantizeKernel::run_requantize_offset_only_convert<uint8_t, int8_t>}, - - {"op_F32_QSYMM8", &CpuQuantizeKernel::run_quantize_qsymm8<float, int8_t>}, - - {"op_F32_QASYMM8", &CpuQuantizeKernel::run_quantize_qasymm8<float, uint8_t>}, - {"op_F32_QASYMM8_SIGNED", &CpuQuantizeKernel::run_quantize_qasymm8<float, int8_t>}, - {"op_F32_QASYMM16", &CpuQuantizeKernel::run_quantize_qasymm16<float>}, - -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - {"op_F16_QASYMM8", &CpuQuantizeKernel::run_quantize_qasymm8<float16_t, uint8_t>}, - {"op_F16_QASYMM8_SIGNED", &CpuQuantizeKernel::run_quantize_qasymm8<float16_t, int8_t>}, - {"op_F16_QASYMM16", 
&CpuQuantizeKernel::run_quantize_qasymm16<float16_t>}, -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC*/ + REGISTER_INTEGER_NEON(u8_i8_run_requantize_offset_only_convert)}, + + {"op_F32_QSYMM8", REGISTER_FP32_NEON(fp32_i8_run_quantize_qsymm8)}, + {"op_F32_QASYMM8", REGISTER_FP32_NEON(fp32_u8_run_quantize_qasymm8)}, + {"op_F32_QASYMM8_SIGNED", REGISTER_FP32_NEON(fp32_i8_run_quantize_qasymm8)}, + {"op_F32_QASYMM16", REGISTER_FP32_NEON(fp32_run_quantize_qasymm16)}, + +#ifdef ARM_COMPUTE_ENABLE_FP16 + {"op_F16_QASYMM8", REGISTER_FP16_NEON(fp16_u8_run_quantize_qasymm8)}, + {"op_F16_QASYMM8_SIGNED", REGISTER_FP16_NEON(fp16_i8_run_quantize_qasymm8)}, + {"op_F16_QASYMM16", REGISTER_FP16_NEON(fp16_run_quantize_qasymm16)}, +#endif /* ARM_COMPUTE_ENABLE_FP16 */ }; std::string function_to_call("op_"); @@ -203,242 +147,6 @@ Status CpuQuantizeKernel::validate(const ITensorInfo *src, const ITensorInfo *ds return Status{}; } -template <typename TIn, typename TOut> -void CpuQuantizeKernel::run_quantize_qsymm8(const ITensor *src, ITensor *dst, const Window &window) -{ - const auto window_start_x = static_cast<int>(window.x().start()); - const auto window_end_x = static_cast<int>(window.x().end()); - - const UniformQuantizationInfo uqinfo_in = src->info()->quantization_info().uniform(); - UniformQuantizationInfo uqinfo = dst->info()->quantization_info().uniform(); - uqinfo = compute_requantization_scale_offset(uqinfo_in, uqinfo); - - // Collapse window and reset first dimension to handle tail calculations manually - Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); - win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input(src, win_collapsed); - Iterator output(dst, win_collapsed); - execute_window_loop( - win_collapsed, - [&](const Coordinates &) - { - auto input_ptr = reinterpret_cast<const TIn *>(input.ptr()); - auto output_ptr = reinterpret_cast<TOut *>(output.ptr()); - int x = window_start_x; - for (; x <= (window_end_x - 
window_step); x += window_step) - { - wrapper::vstore(&output_ptr[x], vquantize_qasymm8<TOut>(load_value(&input_ptr[x]), uqinfo)); - } - // Compute left-over elements - for (; x < window_end_x; ++x) - { - output_ptr[x] = quantize_qsymm8(input_ptr[x], dst->info()->quantization_info()); - } - }, - input, output); -} - -template <typename TIn, typename TOut> -void CpuQuantizeKernel::run_requantize_offset_only_convert(const ITensor *src, ITensor *dst, const Window &window) -{ - const auto window_start_x = static_cast<int>(window.x().start()); - const auto window_end_x = static_cast<int>(window.x().end()); - - // Calculate output offset difference. - const UniformQuantizationInfo uqinfo_in = src->info()->quantization_info().uniform(); - UniformQuantizationInfo uqinfo = dst->info()->quantization_info().uniform(); - uqinfo = compute_requantization_scale_offset(uqinfo_in, uqinfo); - - // Collapse window and reset first dimension to handle tail calculations manually - Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); - - win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); - - // Duplicate offset in signed vector format - const int8x16_t offset = wrapper::vdup_n(static_cast<int8_t>(uqinfo.offset), wrapper::traits::vector_128_tag{}); - - Iterator input(src, win_collapsed); - Iterator output(dst, win_collapsed); - execute_window_loop( - win_collapsed, - [&](const Coordinates &) - { - auto input_ptr = reinterpret_cast<const TIn *>(input.ptr()); - auto output_ptr = reinterpret_cast<TOut *>(output.ptr()); - int x = window_start_x; - for (; x <= (window_end_x - window_step); x += window_step) - { - const wrapper::traits::neon_vector_t<TIn, window_step> qv = - wrapper::vloadq(input_ptr + x); // load 128 bit vector of 8 bit datatype - - // Signed addition. - auto res = vaddq_s8(reinterpret_cast<int8x16_t>(qv), offset); - - // Output is dependent on datatype. 
- wrapper::vstore(&output_ptr[x], - reinterpret_cast<wrapper::traits::neon_vector_t<TOut, window_step>>(res)); - } - // Compute left-over elements - for (; x < window_end_x; ++x) - { - auto result = uqinfo.offset + static_cast<int32_t>(input_ptr[x]); - output_ptr[x] = static_cast<TOut>(result); - } - }, - input, output); -} - -template <typename TIn, typename TOut> -void CpuQuantizeKernel::run_requantize_offset_only(const ITensor *src, ITensor *dst, const Window &window) -{ - const auto window_start_x = static_cast<int>(window.x().start()); - const auto window_end_x = static_cast<int>(window.x().end()); - - const UniformQuantizationInfo uqinfo_in = src->info()->quantization_info().uniform(); - UniformQuantizationInfo uqinfo = dst->info()->quantization_info().uniform(); - uqinfo = compute_requantization_scale_offset(uqinfo_in, uqinfo); - - // Collapse window and reset first dimension to handle tail calculations manually - Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); - win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); - - // Duplicate offset in signed vector format - const int16x8_t offset = wrapper::vdup_n(static_cast<int16_t>(uqinfo.offset), wrapper::traits::vector_128_tag{}); - - const int32_t low_bound = (dst->info()->data_type() == DataType::QASYMM8) ? 0 : -128; - const int32_t upper_bound = (dst->info()->data_type() == DataType::QASYMM8) ? 
255 : 127; - - Iterator input(src, win_collapsed); - Iterator output(dst, win_collapsed); - execute_window_loop( - win_collapsed, - [&](const Coordinates &) - { - auto input_ptr = reinterpret_cast<const TIn *>(input.ptr()); - TOut *output_ptr = reinterpret_cast<TOut *>(output.ptr()); - - int x = window_start_x; - for (; x <= (window_end_x - window_step); x += window_step) - { - const auto qv = wrapper::vloadq(input_ptr + x); // load 128 bit vector of 8 bit datatype - int16x8_t lower = reinterpret_cast<int16x8_t>(wrapper::vmovl(wrapper::vgetlow(qv))); - int16x8_t upper = reinterpret_cast<int16x8_t>(wrapper::vmovl(wrapper::vgethigh(qv))); - - // Signed addition. - lower = wrapper::vqadd(lower, offset); - upper = wrapper::vqadd(upper, offset); - - // Output is dependent on datatype. - auto res = recombine_8_16<TOut>(lower, upper); - wrapper::vstore(&output_ptr[x], res); - } - // Compute left-over elements - for (; x < window_end_x; ++x) - { - // Add offset and clamp result to within the range of the output datatype. - int32_t result = uqinfo.offset + static_cast<int32_t>(input_ptr[x]); - result = utility::clamp<int32_t>(result, low_bound, upper_bound); - - // Cast result to output datatype. 
- output_ptr[x] = static_cast<TOut>(result); - } - }, - input, output); -} - -template <typename TIn, typename TOut> -void CpuQuantizeKernel::run_quantize_qasymm8(const ITensor *src, ITensor *dst, const Window &window) -{ - const auto window_start_x = static_cast<int>(window.x().start()); - const auto window_end_x = static_cast<int>(window.x().end()); - - const UniformQuantizationInfo uqinfo_in = src->info()->quantization_info().uniform(); - UniformQuantizationInfo uqinfo = dst->info()->quantization_info().uniform(); - if (is_data_type_quantized_asymmetric(src->info()->data_type())) - { - uqinfo = compute_requantization_scale_offset(uqinfo_in, uqinfo); - } -#ifdef __aarch64__ - constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_EVEN; -#else //__aarch64__ - constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_ZERO; -#endif //__aarch64__ - - // Collapse window and reset first dimension to handle tail calculations manually - Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); - win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input(src, win_collapsed); - Iterator output(dst, win_collapsed); - execute_window_loop( - win_collapsed, - [&](const Coordinates &) - { - auto input_ptr = reinterpret_cast<const TIn *>(input.ptr()); - auto output_ptr = reinterpret_cast<TOut *>(output.ptr()); - - int x = window_start_x; - for (; x <= (window_end_x - window_step); x += window_step) - { - wrapper::vstore(&output_ptr[x], vquantize_qasymm8<TOut>(load_value(&input_ptr[x]), uqinfo)); - } - // Compute left-over elements - for (; x < window_end_x; ++x) - { - output_ptr[x] = Qasymm8QuantizationHelper<TOut>::quantize(input_ptr[x], uqinfo, rounding_policy); - } - }, - input, output); -} - -template <typename T> -void CpuQuantizeKernel::run_quantize_qasymm16(const ITensor *src, ITensor *dst, const Window &window) -{ - const auto window_start_x = static_cast<int>(window.x().start()); - const auto window_end_x = 
static_cast<int>(window.x().end()); - - const UniformQuantizationInfo uqinfo_in = src->info()->quantization_info().uniform(); - UniformQuantizationInfo uqinfo = dst->info()->quantization_info().uniform(); - if (is_data_type_quantized_asymmetric(src->info()->data_type())) - { - uqinfo = compute_requantization_scale_offset(uqinfo_in, uqinfo); - } -#ifdef __aarch64__ - constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_EVEN; -#else //__aarch64__ - constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_ZERO; -#endif //__aarch64__ - - // Collapse window and reset first dimension to handle tail calculations manually - Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); - win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input(src, win_collapsed); - Iterator output(dst, win_collapsed); - execute_window_loop( - win_collapsed, - [&](const Coordinates &) - { - auto input_ptr = reinterpret_cast<const T *>(input.ptr()); - auto output_ptr = reinterpret_cast<uint16_t *>(output.ptr()); - - int x = window_start_x; - for (; x <= (window_end_x - window_step); x += window_step) - { - uint16x8x2_t tmp = vquantize_qasymm16(load_value(&input_ptr[x]), uqinfo); - vst1q_u16(&output_ptr[x], tmp.val[0]); - vst1q_u16(&output_ptr[x + 8], tmp.val[1]); - } - // Compute left-over elements - for (; x < window_end_x; ++x) - { - output_ptr[x] = quantize_qasymm16(input_ptr[x], uqinfo, rounding_policy); - } - }, - input, output); -} - void CpuQuantizeKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) { ARM_COMPUTE_UNUSED(info); @@ -448,7 +156,7 @@ void CpuQuantizeKernel::run_op(ITensorPack &tensors, const Window &window, const const auto src = tensors.get_const_tensor(TensorType::ACL_SRC); auto dst = tensors.get_tensor(TensorType::ACL_DST); - (this->*_func)(src, dst, window); + (*_func)(src, dst, window); } const char *CpuQuantizeKernel::name() const diff --git a/src/cpu/kernels/CpuQuantizeKernel.h 
b/src/cpu/kernels/CpuQuantizeKernel.h index c2f7ac6d9d..750310c811 100644 --- a/src/cpu/kernels/CpuQuantizeKernel.h +++ b/src/cpu/kernels/CpuQuantizeKernel.h @@ -76,31 +76,7 @@ private: * * @param[in] window Region on which to execute the kernel. */ - using QuantizeFunctionExecutorPtr = void (CpuQuantizeKernel::*)(const ITensor *src, - ITensor *dst, - const Window &window); - /** Function to apply QASYMM8 or QASYMM8_SIGNED quantization on a tensor. - * - * @param[in] window Region on which to execute the kernel. - */ - template <typename TIn, typename TOut> - void run_quantize_qasymm8(const ITensor *src, ITensor *dst, const Window &window); - /** Function to apply QASYMM16 quantization on a tensor. - * - * @param[in] window Region on which to execute the kernel. - */ - template <typename T> - void run_quantize_qasymm16(const ITensor *src, ITensor *dst, const Window &window); - - template <typename TIn, typename TOut> - void run_quantize_qsymm8(const ITensor *src, ITensor *dst, const Window &window); - - template <typename TIn, typename TOut> - void run_requantize_offset_only(const ITensor *src, ITensor *dst, const Window &window); - - template <typename TIn, typename TOut> - void run_requantize_offset_only_convert(const ITensor *src, ITensor *dst, const Window &window); - + using QuantizeFunctionExecutorPtr = void (*)(const ITensor *src, ITensor *dst, const Window &window); QuantizeFunctionExecutorPtr _func{nullptr}; size_t _split_dimension{Window::DimY}; }; diff --git a/src/cpu/kernels/quantize/generic/neon/fp16.cpp b/src/cpu/kernels/quantize/generic/neon/fp16.cpp new file mode 100644 index 0000000000..456a3bda31 --- /dev/null +++ b/src/cpu/kernels/quantize/generic/neon/fp16.cpp @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2024 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) +#include "src/cpu/kernels/quantize/generic/neon/impl_fp16.h" + +namespace arm_compute +{ +namespace cpu +{ +void fp16_u8_run_quantize_qasymm8(const ITensor *src, ITensor *dst, const Window &window) +{ + run_quantize_qasymm8<float16_t, uint8_t>(src, dst, window); +} +void fp16_i8_run_quantize_qasymm8(const ITensor *src, ITensor *dst, const Window &window) +{ + run_quantize_qasymm8<float16_t, int8_t>(src, dst, window); +} +void fp16_run_quantize_qasymm16(const ITensor *src, ITensor *dst, const Window &window) +{ + run_quantize_qasymm16<float16_t>(src, dst, window); +} +} // namespace cpu +} // namespace arm_compute +#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ diff --git a/src/cpu/kernels/quantize/generic/neon/fp32.cpp b/src/cpu/kernels/quantize/generic/neon/fp32.cpp new file mode 100644 index 0000000000..15f52b2238 --- /dev/null +++ b/src/cpu/kernels/quantize/generic/neon/fp32.cpp @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/cpu/kernels/quantize/generic/neon/impl_fp32.h" + +namespace arm_compute +{ +namespace cpu +{ +void fp32_u8_run_quantize_qasymm8(const ITensor *src, ITensor *dst, const Window &window) +{ + run_quantize_qasymm8<float, uint8_t>(src, dst, window); +} +void fp32_i8_run_quantize_qasymm8(const ITensor *src, ITensor *dst, const Window &window) +{ + run_quantize_qasymm8<float, int8_t>(src, dst, window); +} +void fp32_run_quantize_qasymm16(const ITensor *src, ITensor *dst, const Window &window) +{ + run_quantize_qasymm16<float>(src, dst, window); +} + +void fp32_i8_run_quantize_qsymm8(const ITensor *src, ITensor *dst, const Window &window) +{ + run_quantize_qsymm8<float, int8_t>(src, dst, window); +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/quantize/generic/neon/impl.h b/src/cpu/kernels/quantize/generic/neon/impl.h new file mode 100644 index 0000000000..1861fca391 --- /dev/null +++ b/src/cpu/kernels/quantize/generic/neon/impl.h @@ -0,0 +1,302 @@ +/* + * Copyright (c) 2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ACL_SRC_CPU_KERNELS_QUANTIZE_GENERIC_NEON_IMPL_H +#define ACL_SRC_CPU_KERNELS_QUANTIZE_GENERIC_NEON_IMPL_H + +#include "arm_compute/core/Helpers.h" + +#include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/NEAsymm.h" +#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" + +namespace arm_compute +{ +namespace cpu +{ +constexpr auto window_step = 16; + +template <typename T> +inline float32x4x4_t load_value(const T *input_ptr) +{ + using Tx16_t = typename wrapper::traits::neon_vector<T, 16>::type; + return arm_compute::convert_to_float32x4x4<Tx16_t>(wrapper::vloadq(input_ptr)); +} + +template <typename element_type> +using vector_type = wrapper::traits::neon_vector_t<element_type, window_step>; + +template <typename quantized_type> +vector_type<quantized_type> vquantize_qasymm8(const float32x4x4_t &qv, const UniformQuantizationInfo &qi); + +template <typename TOut, typename = typename std::enable_if<std::is_signed<TOut>::value, bool>::type> +inline int8x16_t recombine_8_16(int16x8_t lower, int16x8_t upper) +{ + return wrapper::vcombine(wrapper::vqmovn(lower), wrapper::vqmovn(upper)); +} + +template <typename TOut, typename = typename std::enable_if<std::is_unsigned<TOut>::value, bool>::type> +inline uint8x16_t recombine_8_16(int16x8_t lower, int16x8_t upper) +{ + return wrapper::vcombine(wrapper::vqmovun(lower), wrapper::vqmovun(upper)); +} + +template <typename TIn, typename TOut> +void run_quantize_qsymm8(const ITensor *src, ITensor *dst, const 
Window &window) +{ + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + + const UniformQuantizationInfo uqinfo_in = src->info()->quantization_info().uniform(); + UniformQuantizationInfo uqinfo = dst->info()->quantization_info().uniform(); + uqinfo = compute_requantization_scale_offset(uqinfo_in, uqinfo); + + // Collapse window and reset first dimension to handle tail calculations manually + Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); + win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input(src, win_collapsed); + Iterator output(dst, win_collapsed); + execute_window_loop( + win_collapsed, + [&](const Coordinates &) + { + auto input_ptr = reinterpret_cast<const TIn *>(input.ptr()); + auto output_ptr = reinterpret_cast<TOut *>(output.ptr()); + int x = window_start_x; + for (; x <= (window_end_x - window_step); x += window_step) + { + wrapper::vstore(&output_ptr[x], vquantize_qasymm8<TOut>(load_value(&input_ptr[x]), uqinfo)); + } + // Compute left-over elements + for (; x < window_end_x; ++x) + { + output_ptr[x] = quantize_qsymm8(input_ptr[x], dst->info()->quantization_info()); + } + }, + input, output); +} + +template <typename TIn, typename TOut> +void run_requantize_offset_only_convert(const ITensor *src, ITensor *dst, const Window &window) +{ + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + + // Calculate output offset difference. 
+ const UniformQuantizationInfo uqinfo_in = src->info()->quantization_info().uniform(); + UniformQuantizationInfo uqinfo = dst->info()->quantization_info().uniform(); + uqinfo = compute_requantization_scale_offset(uqinfo_in, uqinfo); + + // Collapse window and reset first dimension to handle tail calculations manually + Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); + + win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); + + // Duplicate offset in signed vector format + const int8x16_t offset = wrapper::vdup_n(static_cast<int8_t>(uqinfo.offset), wrapper::traits::vector_128_tag{}); + + Iterator input(src, win_collapsed); + Iterator output(dst, win_collapsed); + execute_window_loop( + win_collapsed, + [&](const Coordinates &) + { + auto input_ptr = reinterpret_cast<const TIn *>(input.ptr()); + auto output_ptr = reinterpret_cast<TOut *>(output.ptr()); + int x = window_start_x; + for (; x <= (window_end_x - window_step); x += window_step) + { + const wrapper::traits::neon_vector_t<TIn, window_step> qv = + wrapper::vloadq(input_ptr + x); // load 128 bit vector of 8 bit datatype + + // Signed addition. + auto res = vaddq_s8(reinterpret_cast<int8x16_t>(qv), offset); + + // Output is dependent on datatype. 
+ wrapper::vstore(&output_ptr[x], + reinterpret_cast<wrapper::traits::neon_vector_t<TOut, window_step>>(res)); + } + // Compute left-over elements + for (; x < window_end_x; ++x) + { + auto result = uqinfo.offset + static_cast<int32_t>(input_ptr[x]); + output_ptr[x] = static_cast<TOut>(result); + } + }, + input, output); +} + +template <typename TIn, typename TOut> +void run_requantize_offset_only(const ITensor *src, ITensor *dst, const Window &window) +{ + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + + const UniformQuantizationInfo uqinfo_in = src->info()->quantization_info().uniform(); + UniformQuantizationInfo uqinfo = dst->info()->quantization_info().uniform(); + uqinfo = compute_requantization_scale_offset(uqinfo_in, uqinfo); + + // Collapse window and reset first dimension to handle tail calculations manually + Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); + win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); + + // Duplicate offset in signed vector format + const int16x8_t offset = wrapper::vdup_n(static_cast<int16_t>(uqinfo.offset), wrapper::traits::vector_128_tag{}); + + const int32_t low_bound = (dst->info()->data_type() == DataType::QASYMM8) ? 0 : -128; + const int32_t upper_bound = (dst->info()->data_type() == DataType::QASYMM8) ? 
255 : 127; + + Iterator input(src, win_collapsed); + Iterator output(dst, win_collapsed); + execute_window_loop( + win_collapsed, + [&](const Coordinates &) + { + auto input_ptr = reinterpret_cast<const TIn *>(input.ptr()); + TOut *output_ptr = reinterpret_cast<TOut *>(output.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step); x += window_step) + { + const auto qv = wrapper::vloadq(input_ptr + x); // load 128 bit vector of 8 bit datatype + int16x8_t lower = reinterpret_cast<int16x8_t>(wrapper::vmovl(wrapper::vgetlow(qv))); + int16x8_t upper = reinterpret_cast<int16x8_t>(wrapper::vmovl(wrapper::vgethigh(qv))); + + // Signed addition. + lower = wrapper::vqadd(lower, offset); + upper = wrapper::vqadd(upper, offset); + + // Output is dependent on datatype. + auto res = recombine_8_16<TOut>(lower, upper); + wrapper::vstore(&output_ptr[x], res); + } + // Compute left-over elements + for (; x < window_end_x; ++x) + { + // Add offset and clamp result to within the range of the output datatype. + int32_t result = uqinfo.offset + static_cast<int32_t>(input_ptr[x]); + result = utility::clamp<int32_t>(result, low_bound, upper_bound); + + // Cast result to output datatype. 
+ output_ptr[x] = static_cast<TOut>(result); + } + }, + input, output); +} + +template <typename TIn, typename TOut> +void run_quantize_qasymm8(const ITensor *src, ITensor *dst, const Window &window) +{ + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + + const UniformQuantizationInfo uqinfo_in = src->info()->quantization_info().uniform(); + UniformQuantizationInfo uqinfo = dst->info()->quantization_info().uniform(); + if (is_data_type_quantized_asymmetric(src->info()->data_type())) + { + uqinfo = compute_requantization_scale_offset(uqinfo_in, uqinfo); + } +#ifdef __aarch64__ + constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_EVEN; +#else //__aarch64__ + constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_ZERO; +#endif //__aarch64__ + + // Collapse window and reset first dimension to handle tail calculations manually + Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); + win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input(src, win_collapsed); + Iterator output(dst, win_collapsed); + execute_window_loop( + win_collapsed, + [&](const Coordinates &) + { + auto input_ptr = reinterpret_cast<const TIn *>(input.ptr()); + auto output_ptr = reinterpret_cast<TOut *>(output.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step); x += window_step) + { + wrapper::vstore(&output_ptr[x], vquantize_qasymm8<TOut>(load_value(&input_ptr[x]), uqinfo)); + } + // Compute left-over elements + for (; x < window_end_x; ++x) + { + output_ptr[x] = Qasymm8QuantizationHelper<TOut>::quantize(input_ptr[x], uqinfo, rounding_policy); + } + }, + input, output); +} + +template <typename T> +void run_quantize_qasymm16(const ITensor *src, ITensor *dst, const Window &window) +{ + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + + const 
UniformQuantizationInfo uqinfo_in = src->info()->quantization_info().uniform(); + UniformQuantizationInfo uqinfo = dst->info()->quantization_info().uniform(); + if (is_data_type_quantized_asymmetric(src->info()->data_type())) + { + uqinfo = compute_requantization_scale_offset(uqinfo_in, uqinfo); + } +#ifdef __aarch64__ + constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_EVEN; +#else //__aarch64__ + constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_ZERO; +#endif //__aarch64__ + + // Collapse window and reset first dimension to handle tail calculations manually + Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); + win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input(src, win_collapsed); + Iterator output(dst, win_collapsed); + execute_window_loop( + win_collapsed, + [&](const Coordinates &) + { + auto input_ptr = reinterpret_cast<const T *>(input.ptr()); + auto output_ptr = reinterpret_cast<uint16_t *>(output.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step); x += window_step) + { + uint16x8x2_t tmp = vquantize_qasymm16(load_value(&input_ptr[x]), uqinfo); + vst1q_u16(&output_ptr[x], tmp.val[0]); + vst1q_u16(&output_ptr[x + 8], tmp.val[1]); + } + // Compute left-over elements + for (; x < window_end_x; ++x) + { + output_ptr[x] = quantize_qasymm16(input_ptr[x], uqinfo, rounding_policy); + } + }, + input, output); +} +} // namespace cpu +} // namespace arm_compute + +#endif // ACL_SRC_CPU_KERNELS_QUANTIZE_GENERIC_NEON_IMPL_H diff --git a/src/cpu/kernels/quantize/generic/neon/impl_fp16.h b/src/cpu/kernels/quantize/generic/neon/impl_fp16.h new file mode 100644 index 0000000000..47f1b90abd --- /dev/null +++ b/src/cpu/kernels/quantize/generic/neon/impl_fp16.h @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2024 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef ACL_SRC_CPU_KERNELS_QUANTIZE_GENERIC_NEON_IMPL_FP16_H +#define ACL_SRC_CPU_KERNELS_QUANTIZE_GENERIC_NEON_IMPL_FP16_H +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + +#include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/NEAsymm.h" + +namespace arm_compute +{ +namespace cpu +{ + +inline float32x4x4_t load_value(const float16_t *input_ptr) +{ + return {vcvt_f32_f16(wrapper::vload(input_ptr)), vcvt_f32_f16(wrapper::vload(input_ptr + 4)), + vcvt_f32_f16(wrapper::vload(input_ptr + 8)), vcvt_f32_f16(wrapper::vload(input_ptr + 12))}; +} + +} // namespace cpu +} // namespace arm_compute +#include "impl.h" +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +#endif // ACL_SRC_CPU_KERNELS_QUANTIZE_GENERIC_NEON_IMPL_FP16_H diff --git a/src/cpu/kernels/quantize/generic/neon/impl_fp32.h b/src/cpu/kernels/quantize/generic/neon/impl_fp32.h new file mode 100644 index 0000000000..00ae242567 --- /dev/null +++ b/src/cpu/kernels/quantize/generic/neon/impl_fp32.h @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ACL_SRC_CPU_KERNELS_QUANTIZE_GENERIC_NEON_IMPL_FP32_H +#define ACL_SRC_CPU_KERNELS_QUANTIZE_GENERIC_NEON_IMPL_FP32_H + +#include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/NEAsymm.h" + +namespace arm_compute +{ +namespace cpu +{ +inline float32x4x4_t load_value(const float *input_ptr) +{ + return {wrapper::vloadq(input_ptr), wrapper::vloadq(input_ptr + 4), wrapper::vloadq(input_ptr + 8), + wrapper::vloadq(input_ptr + 12)}; +} + +} // namespace cpu +} // namespace arm_compute + +#include "impl.h" +#endif // ACL_SRC_CPU_KERNELS_QUANTIZE_GENERIC_NEON_IMPL_FP32_H diff --git a/src/cpu/kernels/quantize/generic/neon/integer.cpp b/src/cpu/kernels/quantize/generic/neon/integer.cpp new file mode 100644 index 0000000000..4e39afaaee --- /dev/null +++ b/src/cpu/kernels/quantize/generic/neon/integer.cpp @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "src/cpu/kernels/quantize/generic/neon/impl.h"

// Non-template entry points for the 8-bit integer input kernels. Each function only
// instantiates the matching template from impl.h; the name prefix spells the input type
// followed (where present) by the output type: u8 = uint8_t, i8 = int8_t.
namespace arm_compute
{
namespace cpu
{
// run_quantize_qasymm8<TIn, TOut>: 8-bit input, 8-bit output.
void u8_u8_run_quantize_qasymm8(const ITensor *src, ITensor *dst, const Window &window)
{
    run_quantize_qasymm8<uint8_t, uint8_t>(src, dst, window);
}
void u8_i8_run_quantize_qasymm8(const ITensor *src, ITensor *dst, const Window &window)
{
    run_quantize_qasymm8<uint8_t, int8_t>(src, dst, window);
}
void i8_u8_run_quantize_qasymm8(const ITensor *src, ITensor *dst, const Window &window)
{
    run_quantize_qasymm8<int8_t, uint8_t>(src, dst, window);
}
void i8_i8_run_quantize_qasymm8(const ITensor *src, ITensor *dst, const Window &window)
{
    run_quantize_qasymm8<int8_t, int8_t>(src, dst, window);
}

// run_quantize_qasymm16<T>: 8-bit input; the template writes uint16_t output.
void u8_run_quantize_qasymm16(const ITensor *src, ITensor *dst, const Window &window)
{
    run_quantize_qasymm16<uint8_t>(src, dst, window);
}
void i8_run_quantize_qasymm16(const ITensor *src, ITensor *dst, const Window &window)
{
    run_quantize_qasymm16<int8_t>(src, dst, window);
}

// run_requantize_offset_only<TIn, TOut>: offset-only requantization with saturation/clamping.
void u8_u8_run_requantize_offset_only(const ITensor *src, ITensor *dst, const Window &window)
{
    run_requantize_offset_only<uint8_t, uint8_t>(src, dst, window);
}
void u8_i8_run_requantize_offset_only(const ITensor *src, ITensor *dst, const Window &window)
{
    run_requantize_offset_only<uint8_t, int8_t>(src, dst, window);
}
void i8_u8_run_requantize_offset_only(const ITensor *src, ITensor *dst, const Window &window)
{
    run_requantize_offset_only<int8_t, uint8_t>(src, dst, window);
}
void i8_i8_run_requantize_offset_only(const ITensor *src, ITensor *dst, const Window &window)
{
    run_requantize_offset_only<int8_t, int8_t>(src, dst, window);
}

// run_requantize_offset_only_convert<TIn, TOut>: only the two mixed-signedness pairs are provided.
void i8_u8_run_requantize_offset_only_convert(const ITensor *src, ITensor *dst, const Window &window)
{
    run_requantize_offset_only_convert<int8_t, uint8_t>(src, dst, window);
}
void u8_i8_run_requantize_offset_only_convert(const ITensor *src, ITensor *dst, const Window &window)
{
    run_requantize_offset_only_convert<uint8_t, int8_t>(src, dst, window);
}
} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/kernels/quantize/generic/neon/list.h b/src/cpu/kernels/quantize/generic/neon/list.h
new file mode 100644
index 0000000000..c4fb1048eb
--- /dev/null
+++ b/src/cpu/kernels/quantize/generic/neon/list.h
@@ -0,0 +1,66 @@
/*
 * Copyright (c) 2024 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#ifndef ACL_SRC_CPU_KERNELS_QUANTIZE_GENERIC_NEON_LIST_H
#define ACL_SRC_CPU_KERNELS_QUANTIZE_GENERIC_NEON_LIST_H

#include "arm_compute/core/Helpers.h"

namespace arm_compute
{
namespace cpu
{

// Expands to the prototype shared by every quantize kernel entry point:
//   void func_name(const ITensor *src, ITensor *dst, const Window &window)
// The macro is local to this header and is #undef'd after the last declaration.
#define DECLARE_QUANTIZE_KERNEL(func_name) void func_name(const ITensor *src, ITensor *dst, const Window &window)

// 8-bit input -> 8-bit output quantization.
DECLARE_QUANTIZE_KERNEL(u8_u8_run_quantize_qasymm8);
DECLARE_QUANTIZE_KERNEL(u8_i8_run_quantize_qasymm8);
DECLARE_QUANTIZE_KERNEL(i8_u8_run_quantize_qasymm8);
DECLARE_QUANTIZE_KERNEL(i8_i8_run_quantize_qasymm8);

// Offset-only requantization.
DECLARE_QUANTIZE_KERNEL(u8_u8_run_requantize_offset_only);
DECLARE_QUANTIZE_KERNEL(u8_i8_run_requantize_offset_only);
DECLARE_QUANTIZE_KERNEL(i8_u8_run_requantize_offset_only);
DECLARE_QUANTIZE_KERNEL(i8_i8_run_requantize_offset_only);

// Offset-only requantization, "convert" variant (mixed-signedness pairs only).
DECLARE_QUANTIZE_KERNEL(i8_u8_run_requantize_offset_only_convert);
DECLARE_QUANTIZE_KERNEL(u8_i8_run_requantize_offset_only_convert);

// 8-bit input -> 16-bit output quantization.
DECLARE_QUANTIZE_KERNEL(u8_run_quantize_qasymm16);
DECLARE_QUANTIZE_KERNEL(i8_run_quantize_qasymm16);

// FP32 input kernels.
DECLARE_QUANTIZE_KERNEL(fp32_u8_run_quantize_qasymm8);
DECLARE_QUANTIZE_KERNEL(fp32_i8_run_quantize_qasymm8);
DECLARE_QUANTIZE_KERNEL(fp32_run_quantize_qasymm16);

DECLARE_QUANTIZE_KERNEL(fp32_i8_run_quantize_qsymm8);

// FP16 input kernels. NOTE(review): declared unconditionally here; the definitions are not
// visible in this header, so confirm they are only referenced in FP16-enabled builds.
DECLARE_QUANTIZE_KERNEL(fp16_u8_run_quantize_qasymm8);
DECLARE_QUANTIZE_KERNEL(fp16_i8_run_quantize_qasymm8);
DECLARE_QUANTIZE_KERNEL(fp16_run_quantize_qasymm16);

#undef DECLARE_QUANTIZE_KERNEL

} // namespace cpu
} // namespace arm_compute
#endif // ACL_SRC_CPU_KERNELS_QUANTIZE_GENERIC_NEON_LIST_H
diff --git a/src/cpu/kernels/quantize/generic/neon/vquantize.cpp b/src/cpu/kernels/quantize/generic/neon/vquantize.cpp
new file mode 100644
index 0000000000..d40702bc88
--- /dev/null
+++ b/src/cpu/kernels/quantize/generic/neon/vquantize.cpp
@@ -0,0 +1,41 @@
/*
 * Copyright (c) 2024 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "impl.h"
namespace arm_compute
{
namespace cpu
{
// Explicit specialisations of vquantize_qasymm8, which impl.h only declares.
// Defining them in this single translation unit gives every kernel that includes
// impl.h one shared definition per output type.

// uint8_t output: forwards to vquantize.
template <>
vector_type<uint8_t> vquantize_qasymm8<uint8_t>(const float32x4x4_t &qv, const UniformQuantizationInfo &qi)
{
    return vquantize(qv, qi);
}

// int8_t output: forwards to vquantize_signed.
template <>
vector_type<int8_t> vquantize_qasymm8<int8_t>(const float32x4x4_t &qv, const UniformQuantizationInfo &qi)
{
    return vquantize_signed(qv, qi);
}
} // namespace cpu
} // namespace arm_compute