diff options
Diffstat (limited to 'src')
-rw-r--r-- | src/cpu/kernels/CpuQuantizeKernel.cpp | 170 | ||||
-rw-r--r-- | src/cpu/kernels/CpuQuantizeKernel.h | 25 | ||||
-rw-r--r-- | src/cpu/operators/CpuQuantize.cpp | 5 |
3 files changed, 186 insertions, 14 deletions
diff --git a/src/cpu/kernels/CpuQuantizeKernel.cpp b/src/cpu/kernels/CpuQuantizeKernel.cpp index 5dde680837..d2ac6cf8ac 100644 --- a/src/cpu/kernels/CpuQuantizeKernel.cpp +++ b/src/cpu/kernels/CpuQuantizeKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2022 Arm Limited. + * Copyright (c) 2017-2022, 2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -104,6 +104,18 @@ vector_type<int8_t> vquantize_qasymm8<int8_t>(const float32x4x4_t &qv, const Uni return vquantize_signed(qv, qi); } +template <typename TOut, typename = typename std::enable_if<std::is_signed<TOut>::value, bool>::type> +inline int8x16_t recombine_8_16(int16x8_t lower, int16x8_t upper) +{ + return wrapper::vcombine(wrapper::vqmovn(lower), wrapper::vqmovn(upper)); +} + +template <typename TOut, typename = typename std::enable_if<std::is_unsigned<TOut>::value, bool>::type> +inline uint8x16_t recombine_8_16(int16x8_t lower, int16x8_t upper) +{ + return wrapper::vcombine(wrapper::vqmovun(lower), wrapper::vqmovun(upper)); +} + } // namespace void CpuQuantizeKernel::configure(const ITensorInfo *src, ITensorInfo *dst) @@ -120,6 +132,19 @@ void CpuQuantizeKernel::configure(const ITensorInfo *src, ITensorInfo *dst) {"op_QASYMM8_SIGNED_QASYMM8_SIGNED", &CpuQuantizeKernel::run_quantize_qasymm8<int8_t, int8_t>}, {"op_QASYMM8_SIGNED_QASYMM16", &CpuQuantizeKernel::run_quantize_qasymm16<int8_t>}, + // Functions for offset only requantization + {"op_OFFSET_ONLY_QASYMM8_QASYMM8", &CpuQuantizeKernel::run_requantize_offset_only<uint8_t, uint8_t>}, + {"op_OFFSET_ONLY_QASYMM8_QASYMM8_SIGNED", &CpuQuantizeKernel::run_requantize_offset_only<uint8_t, int8_t>}, + {"op_OFFSET_ONLY_QASYMM8_SIGNED_QASYMM8", &CpuQuantizeKernel::run_requantize_offset_only<int8_t, uint8_t>}, + {"op_OFFSET_ONLY_QASYMM8_SIGNED_QASYMM8_SIGNED", + &CpuQuantizeKernel::run_requantize_offset_only<int8_t, int8_t>}, + + // Functions for offset uint8 to int8 and vice versa quantization (no scale changes) + {"op_OFFSET_ONLY_CONVERT_QASYMM8_SIGNED_QASYMM8", + &CpuQuantizeKernel::run_requantize_offset_only_convert<int8_t, uint8_t>}, + {"op_OFFSET_ONLY_CONVERT_QASYMM8_QASYMM8_SIGNED", + &CpuQuantizeKernel::run_requantize_offset_only_convert<uint8_t, int8_t>}, + {"op_F32_QSYMM8", &CpuQuantizeKernel::run_quantize_qsymm8<float, int8_t>}, {"op_F32_QASYMM8", &CpuQuantizeKernel::run_quantize_qasymm8<float, uint8_t>}, @@ -134,6 +159,26 @@ void CpuQuantizeKernel::configure(const ITensorInfo *src, ITensorInfo *dst) }; std::string function_to_call("op_"); + + // For offset only functions - must be 8-bit and have identical scale values. + if (src->quantization_info().scale() == dst->quantization_info().scale() && + (is_data_type_quantized_asymmetric_char(src->data_type()) && + is_data_type_quantized_asymmetric_char(dst->data_type()))) + { + function_to_call += "OFFSET_ONLY_"; + // For optimized datatype conversion 8-bit re-quantization offset only functions. + // These must have an offset of exactly 128 to match requirements - has specific circumstances to match use case. + auto uqinfo = + compute_requantization_scale_offset(src->quantization_info().uniform(), dst->quantization_info().uniform()); + const auto src_dt = src->data_type(); + if (src->data_type() != dst->data_type() && ((src_dt == DataType::QASYMM8_SIGNED && uqinfo.offset == 128) || + (src_dt == DataType::QASYMM8 && uqinfo.offset == -128))) + { + function_to_call += "CONVERT_"; + } + } + + // Specify datatype for function function_to_call += string_from_data_type(src->data_type()) + "_"; function_to_call += string_from_data_type(dst->data_type()); @@ -145,9 +190,11 @@ void CpuQuantizeKernel::configure(const ITensorInfo *src, ITensorInfo *dst) } _func = it->second; - // Configure kernel window - Window win_config = calculate_max_window(*src, Steps()); - ICpuKernel::configure(win_config); + // Calculate window. Squash if possible. + Window win; + std::tie(win, _split_dimension) = calculate_squashed_or_max_window(*src); + + ICpuKernel::configure(win); } Status CpuQuantizeKernel::validate(const ITensorInfo *src, const ITensorInfo *dst) @@ -164,10 +211,8 @@ void CpuQuantizeKernel::run_quantize_qsymm8(const ITensor *src, ITensor *dst, co const UniformQuantizationInfo uqinfo_in = src->info()->quantization_info().uniform(); UniformQuantizationInfo uqinfo = dst->info()->quantization_info().uniform(); - if (is_data_type_quantized_asymmetric(src->info()->data_type())) - { - uqinfo = compute_requantization_scale_offset(uqinfo_in, uqinfo); - } + uqinfo = compute_requantization_scale_offset(uqinfo_in, uqinfo); + // Collapse window and reset first dimension to handle tail calculations manually Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); @@ -195,6 +240,114 @@ void CpuQuantizeKernel::run_quantize_qsymm8(const ITensor *src, ITensor *dst, co } template <typename TIn, typename TOut> +void CpuQuantizeKernel::run_requantize_offset_only_convert(const ITensor *src, ITensor *dst, const Window &window) +{ + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + + // Calculate output offset difference. + const UniformQuantizationInfo uqinfo_in = src->info()->quantization_info().uniform(); + UniformQuantizationInfo uqinfo = dst->info()->quantization_info().uniform(); + uqinfo = compute_requantization_scale_offset(uqinfo_in, uqinfo); + + // Collapse window and reset first dimension to handle tail calculations manually + Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); + + win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); + + // Duplicate offset in signed vector format + const int8x16_t offset = wrapper::vdup_n(static_cast<int8_t>(uqinfo.offset), wrapper::traits::vector_128_tag{}); + + Iterator input(src, win_collapsed); + Iterator output(dst, win_collapsed); + execute_window_loop( + win_collapsed, + [&](const Coordinates &) + { + auto input_ptr = reinterpret_cast<const TIn *>(input.ptr()); + auto output_ptr = reinterpret_cast<TOut *>(output.ptr()); + int x = window_start_x; + for (; x <= (window_end_x - window_step); x += window_step) + { + const wrapper::traits::neon_vector_t<TIn, window_step> qv = + wrapper::vloadq(input_ptr + x); // load 128 bit vector of 8 bit datatype + + // Signed addition. + auto res = vaddq_s8(reinterpret_cast<int8x16_t>(qv), offset); + + // Output is dependent on datatype. + wrapper::vstore(&output_ptr[x], + reinterpret_cast<wrapper::traits::neon_vector_t<TOut, window_step>>(res)); + } + // Compute left-over elements + for (; x < window_end_x; ++x) + { + auto result = uqinfo.offset + static_cast<int32_t>(input_ptr[x]); + output_ptr[x] = static_cast<TOut>(result); + } + }, + input, output); +} + +template <typename TIn, typename TOut> +void CpuQuantizeKernel::run_requantize_offset_only(const ITensor *src, ITensor *dst, const Window &window) +{ + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + + const UniformQuantizationInfo uqinfo_in = src->info()->quantization_info().uniform(); + UniformQuantizationInfo uqinfo = dst->info()->quantization_info().uniform(); + uqinfo = compute_requantization_scale_offset(uqinfo_in, uqinfo); + + // Collapse window and reset first dimension to handle tail calculations manually + Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); + win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); + + // Duplicate offset in signed vector format + const int16x8_t offset = wrapper::vdup_n(static_cast<int16_t>(uqinfo.offset), wrapper::traits::vector_128_tag{}); + + const int32_t low_bound = (dst->info()->data_type() == DataType::QASYMM8) ? 0 : -128; + const int32_t upper_bound = (dst->info()->data_type() == DataType::QASYMM8) ? 255 : 127; + + Iterator input(src, win_collapsed); + Iterator output(dst, win_collapsed); + execute_window_loop( + win_collapsed, + [&](const Coordinates &) + { + auto input_ptr = reinterpret_cast<const TIn *>(input.ptr()); + TOut *output_ptr = reinterpret_cast<TOut *>(output.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step); x += window_step) + { + const auto qv = wrapper::vloadq(input_ptr + x); // load 128 bit vector of 8 bit datatype + int16x8_t lower = reinterpret_cast<int16x8_t>(wrapper::vmovl(wrapper::vgetlow(qv))); + int16x8_t upper = reinterpret_cast<int16x8_t>(wrapper::vmovl(wrapper::vgethigh(qv))); + + // Signed addition. + lower = wrapper::vqadd(lower, offset); + upper = wrapper::vqadd(upper, offset); + + // Output is dependent on datatype. + auto res = recombine_8_16<TOut>(lower, upper); + wrapper::vstore(&output_ptr[x], res); + } + // Compute left-over elements + for (; x < window_end_x; ++x) + { + // Add offset and clamp result to within the range of the output datatype. + int32_t result = uqinfo.offset + static_cast<int32_t>(input_ptr[x]); + result = utility::clamp<int32_t>(result, low_bound, upper_bound); + + // Cast result to output datatype. + output_ptr[x] = static_cast<TOut>(result); + } + }, + input, output); +} + +template <typename TIn, typename TOut> void CpuQuantizeKernel::run_quantize_qasymm8(const ITensor *src, ITensor *dst, const Window &window) { const auto window_start_x = static_cast<int>(window.x().start()); @@ -302,6 +455,7 @@ const char *CpuQuantizeKernel::name() const { return "CpuQuantizeKernel"; } + } // namespace kernels } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/CpuQuantizeKernel.h b/src/cpu/kernels/CpuQuantizeKernel.h index d6714136da..c2f7ac6d9d 100644 --- a/src/cpu/kernels/CpuQuantizeKernel.h +++ b/src/cpu/kernels/CpuQuantizeKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2022 Arm Limited. + * Copyright (c) 2017-2022, 2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_CPU_QUANTIZE_KERNEL_H -#define ARM_COMPUTE_CPU_QUANTIZE_KERNEL_H +#ifndef ACL_SRC_CPU_KERNELS_CPUQUANTIZEKERNEL_H +#define ACL_SRC_CPU_KERNELS_CPUQUANTIZEKERNEL_H #include "src/core/common/Macros.h" #include "src/cpu/ICpuKernel.h" @@ -58,6 +58,15 @@ public: */ static Status validate(const ITensorInfo *src, const ITensorInfo *dst); + /** Get the preferred dimension in which the scheduler splits the work into multiple jobs. + * + * @return The split dimension hint. + */ + size_t get_split_dimension_hint() const + { + return _split_dimension; + } + // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; @@ -86,9 +95,17 @@ private: template <typename TIn, typename TOut> void run_quantize_qsymm8(const ITensor *src, ITensor *dst, const Window &window); + template <typename TIn, typename TOut> + void run_requantize_offset_only(const ITensor *src, ITensor *dst, const Window &window); + + template <typename TIn, typename TOut> + void run_requantize_offset_only_convert(const ITensor *src, ITensor *dst, const Window &window); + QuantizeFunctionExecutorPtr _func{nullptr}; + size_t _split_dimension{Window::DimY}; }; + } // namespace kernels } // namespace cpu } // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_QUANTIZE_KERNEL_H */ +#endif // ACL_SRC_CPU_KERNELS_CPUQUANTIZEKERNEL_H diff --git a/src/cpu/operators/CpuQuantize.cpp b/src/cpu/operators/CpuQuantize.cpp index 4315499c39..4a3f1827c7 100644 --- a/src/cpu/operators/CpuQuantize.cpp +++ b/src/cpu/operators/CpuQuantize.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021, 2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -55,7 +55,8 @@ void CpuQuantize::configure(const ITensorInfo *src, ITensorInfo *dst) void CpuQuantize::run(ITensorPack &tensors) { ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided"); - NEScheduler::get().schedule_op(_kernel.get(), Window::DimY, _kernel->window(), tensors); + auto split_dimension = static_cast<kernels::CpuQuantizeKernel *>(_kernel.get())->get_split_dimension_hint(); + NEScheduler::get().schedule_op(_kernel.get(), split_dimension, _kernel->window(), tensors); } } // namespace cpu } // namespace arm_compute |