From d64a46c6dfa81ce4607fc3de57bc9d9ac7e01e4a Mon Sep 17 00:00:00 2001
From: Michele Di Giorgio
Date: Tue, 1 Oct 2019 12:25:49 +0100
Subject: COMPMID-2699: Add support for QASYMM16 in NEQuantizationLayer

Change-Id: Icb968e37551a9048040e9aaff5329e874c53a2ee
Signed-off-by: Michele Di Giorgio
Reviewed-on: https://review.mlplatform.org/c/2016
Comments-Addressed: Arm Jenkins
Tested-by: Arm Jenkins
Reviewed-by: Georgios Pinitas
---
 arm_compute/core/NEON/NEAsymm.h                     |  34 +++++++
 .../core/NEON/kernels/NEQuantizationLayerKernel.h   |  25 +++++-
 .../runtime/NEON/functions/NEQuantizationLayer.h    |   4 +-
 .../NEON/kernels/NEQuantizationLayerKernel.cpp      | 100 ++++++++++++++++-----
 src/runtime/NEON/functions/NEQuantizationLayer.cpp  |   5 +-
 tests/validation/NEON/QuantizationLayer.cpp         |  41 ++++++++-
 6 files changed, 176 insertions(+), 33 deletions(-)

diff --git a/arm_compute/core/NEON/NEAsymm.h b/arm_compute/core/NEON/NEAsymm.h
index f2d20d373a..56d4c09f92 100644
--- a/arm_compute/core/NEON/NEAsymm.h
+++ b/arm_compute/core/NEON/NEAsymm.h
@@ -331,6 +331,40 @@ inline uint8x16_t vquantize(const float32x4x4_t &qv, const UniformQuantizationIn
     const uint8x8_t pb = vqmovun_s16(vcombine_s16(vqmovn_s32(rf.val[2]), vqmovn_s32(rf.val[3])));
     return vcombine_u8(pa, pb);
 }
+
+/** Quantize to QASYMM16 a neon vector holding 16 floating point values.
+ *
+ * @param[in] qv Input values to be quantized.
+ * @param[in] qi Quantization information to be used in the computation.
+ *
+ * @return A neon vector holding the quantized values
+ */
+inline uint16x8x2_t vquantize_qasymm16(const float32x4x4_t &qv, const UniformQuantizationInfo &qi)
+{
+    const float       scale     = qi.scale;
+    const int         offset    = qi.offset;
+    const float32x4_t voffset   = vdupq_n_f32(offset);
+    const float32x4_t vinvscale = vdupq_n_f32(1.f / scale);
+    const int32x4x4_t rf =
+    {
+        {
+#ifdef __aarch64__
+            vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)),
+            vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)),
+            vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[2], vinvscale)),
+            vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[3], vinvscale)),
+#else  //__aarch64__
+            vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)),
+            vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)),
+            vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[2], vinvscale)),
+            vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[3], vinvscale)),
+#endif //__aarch64__
+        }
+    };
+    const uint16x8_t pa = vcombine_u16(vqmovun_s32(rf.val[0]), vqmovun_s32(rf.val[1]));
+    const uint16x8_t pb = vcombine_u16(vqmovun_s32(rf.val[2]), vqmovun_s32(rf.val[3]));
+    return { pa, pb };
+}
 } // namespace arm_compute
 #include "arm_compute/core/NEON/NEAsymm.inl"
 #endif // __ARM_COMPUTE_NEASYMM_H__
diff --git a/arm_compute/core/NEON/kernels/NEQuantizationLayerKernel.h b/arm_compute/core/NEON/kernels/NEQuantizationLayerKernel.h
index 391a72c6db..e1aaad5094 100644
--- a/arm_compute/core/NEON/kernels/NEQuantizationLayerKernel.h
+++ b/arm_compute/core/NEON/kernels/NEQuantizationLayerKernel.h
@@ -57,13 +57,15 @@ public:
     /** Set the input, output.
      *
      * @param[in]  input  Source tensor. The dimensions over the third will be interpreted as batches. Data types supported: F32/F16.
-     * @param[out] output Destination tensor with the same dimensions of input. Data types supported: QASYMM8.
+     * @param[out] output Destination tensor with the same dimensions of input. Data types supported: QASYMM8/QASYMM16.
+     *
+     * @note Output auto initialization is not supported by this kernel
      */
     void configure(const ITensor *input, ITensor *output);
     /** Static function to check if given info will lead to a valid configuration of @ref NEQuantizationLayerKernel
      *
      * @param[in] input  Input tensor info. Data types supported: F32/F16.
-     * @param[in] output Output tensor info. Data types supported: QASYMM8.
+     * @param[in] output Output tensor info. Data types supported: QASYMM8/QASYMM16.
      *
      * @return a status
      */
@@ -73,11 +75,28 @@ public:
     void run(const Window &window, const ThreadInfo &info) override;
 
 private:
+    /** Common signature for all the specialised @ref NEQuantizationLayerKernel functions
+     *
+     * @param[in] window Region on which to execute the kernel.
+     */
+    using QuantizationFunctionExecutorPtr = void (NEQuantizationLayerKernel::*)(const Window &window);
+    /** Function to apply QASYMM8 quantization on a tensor.
+     *
+     * @param[in] window Region on which to execute the kernel.
+     */
     template <typename T>
-    void quantize(const Window &window, const QuantizationInfo &qinfo);
+    void run_quantize_qasymm8(const Window &window);
+    /** Function to apply QASYMM16 quantization on a tensor.
+     *
+     * @param[in] window Region on which to execute the kernel.
+     */
+    template <typename T>
+    void run_quantize_qasymm16(const Window &window);
 
     const ITensor *_input;
     ITensor       *_output;
+
+    QuantizationFunctionExecutorPtr _func;
 };
 } // namespace arm_compute
 #endif /*__ARM_COMPUTE_NEQUANTIZATIONLAYERKERNEL_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NEQuantizationLayer.h b/arm_compute/runtime/NEON/functions/NEQuantizationLayer.h
index 46a62bd903..25609324a0 100644
--- a/arm_compute/runtime/NEON/functions/NEQuantizationLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEQuantizationLayer.h
@@ -49,13 +49,13 @@ public:
     /** Set the input and output tensors.
      *
      * @param[in]  input  Source tensor. The dimensions over the third will be interpreted as batches. Data types supported: F32/F16.
-     * @param[out] output Destination tensor with the same dimensions of input. Data types supported: QASYMM8/QSYMM16
+     * @param[out] output Destination tensor with the same dimensions of input. Data types supported: QASYMM8/QASYMM16
      */
     void configure(const ITensor *input, ITensor *output);
     /** Static function to check if given info will lead to a valid configuration of @ref NEQuantizationLayer
      *
      * @param[in] input  Input tensor info. The dimensions over the third will be interpreted as batches. Data types supported: F32/F16.
-     * @param[in] output Output tensor info. Data types supported: QASYMM8/QSYMM16
+     * @param[in] output Output tensor info. Data types supported: QASYMM8/QASYMM16
      *
      * @return a status
      */
diff --git a/src/core/NEON/kernels/NEQuantizationLayerKernel.cpp b/src/core/NEON/kernels/NEQuantizationLayerKernel.cpp
index 0aa34cd411..6a9c4ae14c 100644
--- a/src/core/NEON/kernels/NEQuantizationLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEQuantizationLayerKernel.cpp
@@ -34,9 +34,10 @@
 #include "arm_compute/core/CPP/Validate.h"
 
 #include <arm_neon.h>
+#include <map>
 
-using namespace arm_compute;
-
+namespace arm_compute
+{
 namespace
 {
 Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
@@ -45,7 +46,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
     ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape().total_size() == 0);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8, DataType::QASYMM16);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
 
     return Status{};
@@ -71,7 +72,7 @@ inline const float32x4x4_t load_value(const float16_t *input_ptr)
 } // namespace
 
 NEQuantizationLayerKernel::NEQuantizationLayerKernel()
-    : _input(nullptr), _output(nullptr)
+    : _input(nullptr), _output(nullptr), _func(nullptr)
 {
 }
 
@@ -83,6 +84,33 @@ void NEQuantizationLayerKernel::configure(const ITensor *input, ITensor *output)
     _input  = input;
     _output = output;
 
+    static std::map<DataType, QuantizationFunctionExecutorPtr> quant_map_f32 =
+    {
+        { DataType::QASYMM8, &NEQuantizationLayerKernel::run_quantize_qasymm8<float> },
+        { DataType::QASYMM16, &NEQuantizationLayerKernel::run_quantize_qasymm16<float> },
+    };
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+    static std::map<DataType, QuantizationFunctionExecutorPtr> quant_map_f16 =
+    {
+        { DataType::QASYMM8, &NEQuantizationLayerKernel::run_quantize_qasymm8<float16_t> },
+        { DataType::QASYMM16, &NEQuantizationLayerKernel::run_quantize_qasymm16<float16_t> },
+    };
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC*/
+
+    switch(input->info()->data_type())
+    {
+        case DataType::F32:
+            _func = quant_map_f32[output->info()->data_type()];
+            break;
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+        case DataType::F16:
+            _func = quant_map_f16[output->info()->data_type()];
+            break;
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+        default:
+            ARM_COMPUTE_ERROR("Unsupported input data type.");
+    }
+
     // Configure kernel window
     Window win_config = calculate_max_window(*input->info(), Steps());
 
@@ -96,18 +124,17 @@
 Status NEQuantizationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output)
 {
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output));
-
     return Status{};
 }
 
 template <typename T>
-void NEQuantizationLayerKernel::quantize(const Window &window, const QuantizationInfo &qinfo)
+void NEQuantizationLayerKernel::run_quantize_qasymm8(const Window &window)
 {
     constexpr auto window_step    = 16;
     const auto     window_start_x = static_cast<int>(window.x().start());
     const auto     window_end_x   = static_cast<int>(window.x().end());
 
-    const UniformQuantizationInfo uqinfo = qinfo.uniform();
+    const UniformQuantizationInfo uqinfo = _output->info()->quantization_info().uniform();
 #ifdef __aarch64__
     constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_EVEN;
 #else //__aarch64__
@@ -139,25 +166,54 @@ void NEQuantizationLayerKernel::quantize(const Window &window, const Quantizatio
     input, output);
 }
 
+template <typename T>
+void NEQuantizationLayerKernel::run_quantize_qasymm16(const Window &window)
+{
+    constexpr auto window_step    = 16;
+    const auto     window_start_x = static_cast<int>(window.x().start());
+    const auto     window_end_x   = static_cast<int>(window.x().end());
+
+    const UniformQuantizationInfo uqinfo = _output->info()->quantization_info().uniform();
+#ifdef __aarch64__
+    constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_EVEN;
+#else //__aarch64__
+    constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_ZERO;
+#endif //__aarch64__
+
+    // Collapse window and reset first dimension to handle tail calculations manually
+    Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
+    win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+    Iterator input(_input, win_collapsed);
+    Iterator output(_output, win_collapsed);
+    execute_window_loop(win_collapsed, [&](const Coordinates &)
+    {
+        auto input_ptr  = reinterpret_cast<const T *>(input.ptr());
+        auto output_ptr = reinterpret_cast<uint16_t *>(output.ptr());
+
+        int x = window_start_x;
+        for(; x <= (window_end_x - window_step); x += window_step)
+        {
+            uint16x8x2_t tmp = vquantize_qasymm16(load_value(&input_ptr[x]), uqinfo);
+            vst1q_u16(&output_ptr[x], tmp.val[0]);
+            vst1q_u16(&output_ptr[x + 8], tmp.val[1]);
+        }
+        // Compute left-over elements
+        for(; x < window_end_x; ++x)
+        {
+            output_ptr[x] = quantize_qasymm16(input_ptr[x], uqinfo, rounding_policy);
+        }
+    },
+    input, output);
+}
+
 void NEQuantizationLayerKernel::run(const Window &window, const ThreadInfo &info)
 {
     ARM_COMPUTE_UNUSED(info);
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+    ARM_COMPUTE_ERROR_ON(_func == nullptr);
 
-    const QuantizationInfo &qinfo = _output->info()->quantization_info();
-
-    switch(_input->info()->data_type())
-    {
-        case DataType::F32:
-            NEQuantizationLayerKernel::quantize<float>(window, qinfo);
-            break;
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-        case DataType::F16:
-            NEQuantizationLayerKernel::quantize<float16_t>(window, qinfo);
-            break;
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-        default:
-            ARM_COMPUTE_ERROR("Unsupported data type.");
-    }
+    (this->*_func)(window);
 }
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEQuantizationLayer.cpp b/src/runtime/NEON/functions/NEQuantizationLayer.cpp
index 65873b1b14..4464978762 100644
--- a/src/runtime/NEON/functions/NEQuantizationLayer.cpp
+++ b/src/runtime/NEON/functions/NEQuantizationLayer.cpp
@@ -27,8 +27,8 @@
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
 
-using namespace arm_compute;
-
+namespace arm_compute
+{
 Status NEQuantizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
@@ -46,3 +46,4 @@ void NEQuantizationLayer::configure(const ITensor *input, ITensor *output)
     k->configure(input, output);
     _kernel = std::move(k);
 }
+} // namespace arm_compute
diff --git a/tests/validation/NEON/QuantizationLayer.cpp b/tests/validation/NEON/QuantizationLayer.cpp
index 8d19c93761..49118f7dc5 100644
--- a/tests/validation/NEON/QuantizationLayer.cpp
+++ b/tests/validation/NEON/QuantizationLayer.cpp
@@ -43,7 +43,8 @@ namespace validation
 namespace
 {
 /** Tolerance for quantization */
-constexpr AbsoluteTolerance<uint8_t> tolerance_u8(1);
+constexpr AbsoluteTolerance<uint8_t>  tolerance_u8(1);
+constexpr AbsoluteTolerance<uint16_t> tolerance_u16(1);
 
 const auto QuantizationSmallShapes = concat(datasets::Small3DShapes(), datasets::Small4DShapes());
 const auto QuantizationLargeShapes = concat(datasets::Large3DShapes(), datasets::Large4DShapes());
@@ -98,6 +99,8 @@ DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(QuantizationS
 
 template <typename T>
 using NEQuantizationLayerQASYMM8Fixture = QuantizationValidationFixture<Tensor, Accessor, NEQuantizationLayer, T, uint8_t>;
+template <typename T>
+using NEQuantizationLayerQASYMM16Fixture = QuantizationValidationFixture<Tensor, Accessor, NEQuantizationLayer, T, uint16_t>;
 
 TEST_SUITE(Float)
 TEST_SUITE(FP32)
FIXTURE_DATA_TEST_CASE(RunSmallQASYMM8, NEQuantizationLayerQASYMM8Fixture
     // Validate output
     validate(Accessor(_target), _reference, tolerance_u8);
 }
+FIXTURE_DATA_TEST_CASE(RunSmallQASYMM16, NEQuantizationLayerQASYMM16Fixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(QuantizationSmallShapes,
+                       framework::dataset::make("DataType", DataType::F32)),
+                       framework::dataset::make("DataTypeOut", { DataType::QASYMM16 })),
+                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, 10) })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_u16);
+}
 FIXTURE_DATA_TEST_CASE(RunLargeQASYMM8, NEQuantizationLayerQASYMM8Fixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(QuantizationLargeShapes,
                        framework::dataset::make("DataType", DataType::F32)),
                        framework::dataset::make("DataTypeOut", { DataType::QASYMM8 })),
@@ -117,10 +128,16 @@ FIXTURE_DATA_TEST_CASE(RunLargeQASYMM8, NEQuantizationLayerQASYMM8Fixture
     // Validate output
     validate(Accessor(_target), _reference, tolerance_u8);
 }
+FIXTURE_DATA_TEST_CASE(RunLargeQASYMM16, NEQuantizationLayerQASYMM16Fixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(QuantizationLargeShapes,
+                       framework::dataset::make("DataType", DataType::F32)),
+                       framework::dataset::make("DataTypeOut", { DataType::QASYMM16 })),
+                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, 10) })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_u16);
+}
 TEST_SUITE_END() // FP32
-TEST_SUITE_END() // Float
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-TEST_SUITE(Half)
 TEST_SUITE(FP16)
 FIXTURE_DATA_TEST_CASE(RunSmallQASYMM8, NEQuantizationLayerQASYMM8Fixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(QuantizationSmallShapes,
                        framework::dataset::make("DataType", DataType::F16)),
@@ -130,6 +147,14 @@ FIXTURE_DATA_TEST_CASE(RunSmallQASYMM8, NEQuantizationLayerQASYMM8Fixture,
     // Validate output
     validate(Accessor(_target), _reference, tolerance_u8);
 }
+FIXTURE_DATA_TEST_CASE(RunSmallQASYMM16, NEQuantizationLayerQASYMM16Fixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(QuantizationSmallShapes,
+                       framework::dataset::make("DataType", DataType::F16)),
+                       framework::dataset::make("DataTypeOut", { DataType::QASYMM16 })),
+                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, 10) })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_u16);
+}
 FIXTURE_DATA_TEST_CASE(RunLargeQASYMM8, NEQuantizationLayerQASYMM8Fixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(combine(QuantizationLargeShapes,
                        framework::dataset::make("DataType", DataType::F16)),
                        framework::dataset::make("DataTypeOut", { DataType::QASYMM8 })),
@@ -138,9 +163,17 @@ FIXTURE_DATA_TEST_CASE(RunLargeQASYMM8, NEQuantizationLayerQASYMM8Fixture,
     // Validate output
     validate(Accessor(_target), _reference, tolerance_u8);
 }
+FIXTURE_DATA_TEST_CASE(RunLargeQASYMM16, NEQuantizationLayerQASYMM16Fixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(combine(QuantizationLargeShapes,
+                       framework::dataset::make("DataType", DataType::F16)),
+                       framework::dataset::make("DataTypeOut", { DataType::QASYMM16 })),
+                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, 10) })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_u16);
+}
 TEST_SUITE_END() // FP16
-TEST_SUITE_END() // Half
 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+TEST_SUITE_END() // Float
 
 TEST_SUITE_END() // QuantizationLayer
 TEST_SUITE_END() // NEON
--
cgit v1.2.1
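
Note (editor addition, not part of the patch): the snippet below is a minimal usage sketch of the new QASYMM16 path through NEQuantizationLayer, written only from the configure()/validate() signatures in this change. Because the kernel reads the scale and offset from the output tensor's quantization info and output auto-initialization is not supported, the destination TensorInfo must be fully initialized by the caller. The tensor shape and the QuantizationInfo(0.5f, 10) parameters are illustrative values borrowed from the tests, not requirements.

#include "arm_compute/core/Error.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/functions/NEQuantizationLayer.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
    const TensorShape shape(32U, 16U, 2U);

    // F32 source and QASYMM16 destination; the destination carries the target
    // quantization info (scale = 0.5, offset = 10) that the kernel will use.
    Tensor src;
    Tensor dst;
    src.allocator()->init(TensorInfo(shape, 1, DataType::F32));
    dst.allocator()->init(TensorInfo(shape, 1, DataType::QASYMM16, QuantizationInfo(0.5f, 10)));

    // Check the configuration is valid, then configure the function.
    ARM_COMPUTE_ERROR_THROW_ON(NEQuantizationLayer::validate(src.info(), dst.info()));
    NEQuantizationLayer quant;
    quant.configure(&src, &dst);

    // Allocate backing memory, fill src with F32 data, then execute.
    src.allocator()->allocate();
    dst.allocator()->allocate();
    quant.run();

    return 0;
}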