about summary refs log tree commit diff
path: root/src/cpu/kernels/CpuQuantizeKernel.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'src/cpu/kernels/CpuQuantizeKernel.cpp')
-rw-r--r--  src/cpu/kernels/CpuQuantizeKernel.cpp  170
1 files changed, 162 insertions, 8 deletions
diff --git a/src/cpu/kernels/CpuQuantizeKernel.cpp b/src/cpu/kernels/CpuQuantizeKernel.cpp
index 5dde680837..d2ac6cf8ac 100644
--- a/src/cpu/kernels/CpuQuantizeKernel.cpp
+++ b/src/cpu/kernels/CpuQuantizeKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2022 Arm Limited.
+ * Copyright (c) 2017-2022, 2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -104,6 +104,18 @@ vector_type<int8_t> vquantize_qasymm8<int8_t>(const float32x4x4_t &qv, const Uni
return vquantize_signed(qv, qi);
}
+// Saturate-and-narrow two int16x8_t halves back into one 16-lane 8-bit vector.
+// The overload is selected on the requantization's output element type:
+// signed TOut uses vqmovn (saturating narrow to [-128, 127]).
+template <typename TOut, typename = typename std::enable_if<std::is_signed<TOut>::value, bool>::type>
+inline int8x16_t recombine_8_16(int16x8_t lower, int16x8_t upper)
+{
+ return wrapper::vcombine(wrapper::vqmovn(lower), wrapper::vqmovn(upper));
+}
+
+// Unsigned TOut uses vqmovun (signed-to-unsigned saturating narrow to [0, 255]).
+template <typename TOut, typename = typename std::enable_if<std::is_unsigned<TOut>::value, bool>::type>
+inline uint8x16_t recombine_8_16(int16x8_t lower, int16x8_t upper)
+{
+ return wrapper::vcombine(wrapper::vqmovun(lower), wrapper::vqmovun(upper));
+}
+
+
} // namespace
void CpuQuantizeKernel::configure(const ITensorInfo *src, ITensorInfo *dst)
@@ -120,6 +132,19 @@ void CpuQuantizeKernel::configure(const ITensorInfo *src, ITensorInfo *dst)
{"op_QASYMM8_SIGNED_QASYMM8_SIGNED", &CpuQuantizeKernel::run_quantize_qasymm8<int8_t, int8_t>},
{"op_QASYMM8_SIGNED_QASYMM16", &CpuQuantizeKernel::run_quantize_qasymm16<int8_t>},
+ // Functions for offset only requantization
+ {"op_OFFSET_ONLY_QASYMM8_QASYMM8", &CpuQuantizeKernel::run_requantize_offset_only<uint8_t, uint8_t>},
+ {"op_OFFSET_ONLY_QASYMM8_QASYMM8_SIGNED", &CpuQuantizeKernel::run_requantize_offset_only<uint8_t, int8_t>},
+ {"op_OFFSET_ONLY_QASYMM8_SIGNED_QASYMM8", &CpuQuantizeKernel::run_requantize_offset_only<int8_t, uint8_t>},
+ {"op_OFFSET_ONLY_QASYMM8_SIGNED_QASYMM8_SIGNED",
+ &CpuQuantizeKernel::run_requantize_offset_only<int8_t, int8_t>},
+
+ // Functions for offset uint8 to int8 and vice versa quantization (no scale changes)
+ {"op_OFFSET_ONLY_CONVERT_QASYMM8_SIGNED_QASYMM8",
+ &CpuQuantizeKernel::run_requantize_offset_only_convert<int8_t, uint8_t>},
+ {"op_OFFSET_ONLY_CONVERT_QASYMM8_QASYMM8_SIGNED",
+ &CpuQuantizeKernel::run_requantize_offset_only_convert<uint8_t, int8_t>},
+
{"op_F32_QSYMM8", &CpuQuantizeKernel::run_quantize_qsymm8<float, int8_t>},
{"op_F32_QASYMM8", &CpuQuantizeKernel::run_quantize_qasymm8<float, uint8_t>},
@@ -134,6 +159,26 @@ void CpuQuantizeKernel::configure(const ITensorInfo *src, ITensorInfo *dst)
};
std::string function_to_call("op_");
+
+ // For offset only functions - must be 8-bit and have identical scale values.
+ if (src->quantization_info().scale() == dst->quantization_info().scale() &&
+ (is_data_type_quantized_asymmetric_char(src->data_type()) &&
+ is_data_type_quantized_asymmetric_char(dst->data_type())))
+ {
+ function_to_call += "OFFSET_ONLY_";
+ // For optimized datatype conversion 8-bit re-quantization offset only functions.
+ // These must have an offset of exactly 128 to match requirements - has specific circumstances to match use case.
+ auto uqinfo =
+ compute_requantization_scale_offset(src->quantization_info().uniform(), dst->quantization_info().uniform());
+ const auto src_dt = src->data_type();
+ if (src->data_type() != dst->data_type() && ((src_dt == DataType::QASYMM8_SIGNED && uqinfo.offset == 128) ||
+ (src_dt == DataType::QASYMM8 && uqinfo.offset == -128)))
+ {
+ function_to_call += "CONVERT_";
+ }
+ }
+
+ // Specify datatype for function
function_to_call += string_from_data_type(src->data_type()) + "_";
function_to_call += string_from_data_type(dst->data_type());
@@ -145,9 +190,11 @@ void CpuQuantizeKernel::configure(const ITensorInfo *src, ITensorInfo *dst)
}
_func = it->second;
- // Configure kernel window
- Window win_config = calculate_max_window(*src, Steps());
- ICpuKernel::configure(win_config);
+ // Calculate window. Squash if possible.
+ Window win;
+ std::tie(win, _split_dimension) = calculate_squashed_or_max_window(*src);
+
+ ICpuKernel::configure(win);
}
Status CpuQuantizeKernel::validate(const ITensorInfo *src, const ITensorInfo *dst)
@@ -164,10 +211,8 @@ void CpuQuantizeKernel::run_quantize_qsymm8(const ITensor *src, ITensor *dst, co
const UniformQuantizationInfo uqinfo_in = src->info()->quantization_info().uniform();
UniformQuantizationInfo uqinfo = dst->info()->quantization_info().uniform();
- if (is_data_type_quantized_asymmetric(src->info()->data_type()))
- {
- uqinfo = compute_requantization_scale_offset(uqinfo_in, uqinfo);
- }
+ uqinfo = compute_requantization_scale_offset(uqinfo_in, uqinfo);
+
// Collapse window and reset first dimension to handle tail calculations manually
Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
@@ -195,6 +240,114 @@ void CpuQuantizeKernel::run_quantize_qsymm8(const ITensor *src, ITensor *dst, co
}
template <typename TIn, typename TOut>
+// Fast 8-bit requantization for the special QASYMM8 <-> QASYMM8_SIGNED conversion
+// case: scales are identical and the computed offset delta is exactly +/-128 (both
+// preconditions are enforced in configure() before this kernel is selected).
+// Because the offset is exactly +/-128, plain wrapping int8 addition is equivalent
+// to flipping the sign bit, so no widening or saturation is needed here.
+void CpuQuantizeKernel::run_requantize_offset_only_convert(const ITensor *src, ITensor *dst, const Window &window)
+{
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+
+ // Calculate output offset difference.
+ const UniformQuantizationInfo uqinfo_in = src->info()->quantization_info().uniform();
+ UniformQuantizationInfo uqinfo = dst->info()->quantization_info().uniform();
+ uqinfo = compute_requantization_scale_offset(uqinfo_in, uqinfo);
+
+ // Collapse window and reset first dimension to handle tail calculations manually
+ Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
+
+ win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ // Duplicate offset in signed vector format
+ const int8x16_t offset = wrapper::vdup_n(static_cast<int8_t>(uqinfo.offset), wrapper::traits::vector_128_tag{});
+
+ Iterator input(src, win_collapsed);
+ Iterator output(dst, win_collapsed);
+ execute_window_loop(
+ win_collapsed,
+ [&](const Coordinates &)
+ {
+ auto input_ptr = reinterpret_cast<const TIn *>(input.ptr());
+ auto output_ptr = reinterpret_cast<TOut *>(output.ptr());
+ int x = window_start_x;
+ // Main vector loop: window_step elements per iteration (window_step is
+ // declared in this file's anonymous namespace, outside this hunk).
+ for (; x <= (window_end_x - window_step); x += window_step)
+ {
+ const wrapper::traits::neon_vector_t<TIn, window_step> qv =
+ wrapper::vloadq(input_ptr + x); // load 128 bit vector of 8 bit datatype
+
+ // Signed addition.
+ // NOTE(review): reinterpret_cast between NEON vector types relies on the
+ // GCC/Clang vector-extension behaviour (bit reinterpretation); the
+ // canonical intrinsic would be vreinterpretq_s8_u8 — confirm intended.
+ auto res = vaddq_s8(reinterpret_cast<int8x16_t>(qv), offset);
+
+ // Output is dependent on datatype.
+ wrapper::vstore(&output_ptr[x],
+ reinterpret_cast<wrapper::traits::neon_vector_t<TOut, window_step>>(res));
+ }
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ // Widen to int32 for the add; the narrowing static_cast back to TOut
+ // wraps modulo 256, which matches the vector path for offset == +/-128.
+ auto result = uqinfo.offset + static_cast<int32_t>(input_ptr[x]);
+ output_ptr[x] = static_cast<TOut>(result);
+ }
+ },
+ input, output);
+}
+
+
+// Offset-only requantization between 8-bit asymmetric types: input and output
+// scales are identical (checked in configure()), so only the zero-point offset
+// is applied. Unlike the CONVERT variant, the offset is arbitrary, so each
+// 8-bit half is widened to int16, offset-added with saturation, then narrowed
+// back with saturation via recombine_8_16<TOut>.
+template <typename TIn, typename TOut>
+void CpuQuantizeKernel::run_requantize_offset_only(const ITensor *src, ITensor *dst, const Window &window)
+{
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+
+ const UniformQuantizationInfo uqinfo_in = src->info()->quantization_info().uniform();
+ UniformQuantizationInfo uqinfo = dst->info()->quantization_info().uniform();
+ uqinfo = compute_requantization_scale_offset(uqinfo_in, uqinfo);
+
+ // Collapse window and reset first dimension to handle tail calculations manually
+ Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
+ win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ // Duplicate offset in signed vector format
+ const int16x8_t offset = wrapper::vdup_n(static_cast<int16_t>(uqinfo.offset), wrapper::traits::vector_128_tag{});
+
+ // Scalar-tail clamp bounds chosen from the destination type's representable range.
+ const int32_t low_bound = (dst->info()->data_type() == DataType::QASYMM8) ? 0 : -128;
+ const int32_t upper_bound = (dst->info()->data_type() == DataType::QASYMM8) ? 255 : 127;
+
+ Iterator input(src, win_collapsed);
+ Iterator output(dst, win_collapsed);
+ execute_window_loop(
+ win_collapsed,
+ [&](const Coordinates &)
+ {
+ auto input_ptr = reinterpret_cast<const TIn *>(input.ptr());
+ TOut *output_ptr = reinterpret_cast<TOut *>(output.ptr());
+
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step); x += window_step)
+ {
+ const auto qv = wrapper::vloadq(input_ptr + x); // load 128 bit vector of 8 bit datatype
+ // Widen each 8-lane half to int16 so the offset add cannot overflow 8 bits.
+ // NOTE(review): for signed TIn, vmovl sign-extends, so the reinterpret_cast
+ // to int16x8_t is a no-op for int8 input and a zero-extend for uint8 input.
+ int16x8_t lower = reinterpret_cast<int16x8_t>(wrapper::vmovl(wrapper::vgetlow(qv)));
+ int16x8_t upper = reinterpret_cast<int16x8_t>(wrapper::vmovl(wrapper::vgethigh(qv)));
+
+ // Signed addition.
+ lower = wrapper::vqadd(lower, offset);
+ upper = wrapper::vqadd(upper, offset);
+
+ // Output is dependent on datatype.
+ auto res = recombine_8_16<TOut>(lower, upper);
+ wrapper::vstore(&output_ptr[x], res);
+ }
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ // Add offset and clamp result to within the range of the output datatype.
+ int32_t result = uqinfo.offset + static_cast<int32_t>(input_ptr[x]);
+ result = utility::clamp<int32_t>(result, low_bound, upper_bound);
+
+ // Cast result to output datatype.
+ output_ptr[x] = static_cast<TOut>(result);
+ }
+ },
+ input, output);
+}
+
+
+template <typename TIn, typename TOut>
void CpuQuantizeKernel::run_quantize_qasymm8(const ITensor *src, ITensor *dst, const Window &window)
{
const auto window_start_x = static_cast<int>(window.x().start());
@@ -302,6 +455,7 @@ const char *CpuQuantizeKernel::name() const
{
return "CpuQuantizeKernel";
}
+
} // namespace kernels
} // namespace cpu
} // namespace arm_compute