about summary refs log tree commit diff
path: root/src
diff options
context:
space:
mode:
authorMohammed Suhail Munshi <MohammedSuhail.Munshi@arm.com>2024-01-30 18:25:51 +0000
committerGunes Bayir <gunes.bayir@arm.com>2024-02-20 11:31:57 +0000
commit0a48c4c83b598991b4d4235f870c24d9e6634b20 (patch)
tree4d0117496c527fd952f435711e5c385023d7068e /src
parent946905847bf1d82b183e718fddfc7664702e5a84 (diff)
downloadComputeLibrary-0a48c4c83b598991b4d4235f870c24d9e6634b20.tar.gz
Requantization cases for offset changes only
Resolves: [COMPMID-6681]
Signed-off-by: Mohammed Suhail Munshi <MohammedSuhail.Munshi@arm.com>
Change-Id: I325b9d478dd1d04a45533bb7708cf76e98ee0cee
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/11058
Reviewed-by: Gunes Bayir <gunes.bayir@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Benchmark: Arm Jenkins <bsgcomp@arm.com>
Diffstat (limited to 'src')
-rw-r--r--src/cpu/kernels/CpuQuantizeKernel.cpp170
-rw-r--r--src/cpu/kernels/CpuQuantizeKernel.h25
-rw-r--r--src/cpu/operators/CpuQuantize.cpp5
3 files changed, 186 insertions, 14 deletions
diff --git a/src/cpu/kernels/CpuQuantizeKernel.cpp b/src/cpu/kernels/CpuQuantizeKernel.cpp
index 5dde680837..d2ac6cf8ac 100644
--- a/src/cpu/kernels/CpuQuantizeKernel.cpp
+++ b/src/cpu/kernels/CpuQuantizeKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2022 Arm Limited.
+ * Copyright (c) 2017-2022, 2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -104,6 +104,18 @@ vector_type<int8_t> vquantize_qasymm8<int8_t>(const float32x4x4_t &qv, const Uni
return vquantize_signed(qv, qi);
}
+template <typename TOut, typename = typename std::enable_if<std::is_signed<TOut>::value, bool>::type>
+inline int8x16_t recombine_8_16(int16x8_t lower, int16x8_t upper)
+{
+ return wrapper::vcombine(wrapper::vqmovn(lower), wrapper::vqmovn(upper));
+}
+
+template <typename TOut, typename = typename std::enable_if<std::is_unsigned<TOut>::value, bool>::type>
+inline uint8x16_t recombine_8_16(int16x8_t lower, int16x8_t upper)
+{
+ return wrapper::vcombine(wrapper::vqmovun(lower), wrapper::vqmovun(upper));
+}
+
} // namespace
void CpuQuantizeKernel::configure(const ITensorInfo *src, ITensorInfo *dst)
@@ -120,6 +132,19 @@ void CpuQuantizeKernel::configure(const ITensorInfo *src, ITensorInfo *dst)
{"op_QASYMM8_SIGNED_QASYMM8_SIGNED", &CpuQuantizeKernel::run_quantize_qasymm8<int8_t, int8_t>},
{"op_QASYMM8_SIGNED_QASYMM16", &CpuQuantizeKernel::run_quantize_qasymm16<int8_t>},
+ // Functions for offset only requantization
+ {"op_OFFSET_ONLY_QASYMM8_QASYMM8", &CpuQuantizeKernel::run_requantize_offset_only<uint8_t, uint8_t>},
+ {"op_OFFSET_ONLY_QASYMM8_QASYMM8_SIGNED", &CpuQuantizeKernel::run_requantize_offset_only<uint8_t, int8_t>},
+ {"op_OFFSET_ONLY_QASYMM8_SIGNED_QASYMM8", &CpuQuantizeKernel::run_requantize_offset_only<int8_t, uint8_t>},
+ {"op_OFFSET_ONLY_QASYMM8_SIGNED_QASYMM8_SIGNED",
+ &CpuQuantizeKernel::run_requantize_offset_only<int8_t, int8_t>},
+
+ // Functions for offset uint8 to int8 and vice versa quantization (no scale changes)
+ {"op_OFFSET_ONLY_CONVERT_QASYMM8_SIGNED_QASYMM8",
+ &CpuQuantizeKernel::run_requantize_offset_only_convert<int8_t, uint8_t>},
+ {"op_OFFSET_ONLY_CONVERT_QASYMM8_QASYMM8_SIGNED",
+ &CpuQuantizeKernel::run_requantize_offset_only_convert<uint8_t, int8_t>},
+
{"op_F32_QSYMM8", &CpuQuantizeKernel::run_quantize_qsymm8<float, int8_t>},
{"op_F32_QASYMM8", &CpuQuantizeKernel::run_quantize_qasymm8<float, uint8_t>},
@@ -134,6 +159,26 @@ void CpuQuantizeKernel::configure(const ITensorInfo *src, ITensorInfo *dst)
};
std::string function_to_call("op_");
+
+ // For offset only functions - must be 8-bit and have identical scale values.
+ if (src->quantization_info().scale() == dst->quantization_info().scale() &&
+ (is_data_type_quantized_asymmetric_char(src->data_type()) &&
+ is_data_type_quantized_asymmetric_char(dst->data_type())))
+ {
+ function_to_call += "OFFSET_ONLY_";
+ // For optimized datatype conversion 8-bit re-quantization offset only functions.
+ // These must have an offset of exactly 128 to match requirements - has specific circumstances to match use case.
+ auto uqinfo =
+ compute_requantization_scale_offset(src->quantization_info().uniform(), dst->quantization_info().uniform());
+ const auto src_dt = src->data_type();
+ if (src->data_type() != dst->data_type() && ((src_dt == DataType::QASYMM8_SIGNED && uqinfo.offset == 128) ||
+ (src_dt == DataType::QASYMM8 && uqinfo.offset == -128)))
+ {
+ function_to_call += "CONVERT_";
+ }
+ }
+
+ // Specify datatype for function
function_to_call += string_from_data_type(src->data_type()) + "_";
function_to_call += string_from_data_type(dst->data_type());
@@ -145,9 +190,11 @@ void CpuQuantizeKernel::configure(const ITensorInfo *src, ITensorInfo *dst)
}
_func = it->second;
- // Configure kernel window
- Window win_config = calculate_max_window(*src, Steps());
- ICpuKernel::configure(win_config);
+ // Calculate window. Squash if possible.
+ Window win;
+ std::tie(win, _split_dimension) = calculate_squashed_or_max_window(*src);
+
+ ICpuKernel::configure(win);
}
Status CpuQuantizeKernel::validate(const ITensorInfo *src, const ITensorInfo *dst)
@@ -164,10 +211,8 @@ void CpuQuantizeKernel::run_quantize_qsymm8(const ITensor *src, ITensor *dst, co
const UniformQuantizationInfo uqinfo_in = src->info()->quantization_info().uniform();
UniformQuantizationInfo uqinfo = dst->info()->quantization_info().uniform();
- if (is_data_type_quantized_asymmetric(src->info()->data_type()))
- {
- uqinfo = compute_requantization_scale_offset(uqinfo_in, uqinfo);
- }
+ uqinfo = compute_requantization_scale_offset(uqinfo_in, uqinfo);
+
// Collapse window and reset first dimension to handle tail calculations manually
Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
@@ -195,6 +240,114 @@ void CpuQuantizeKernel::run_quantize_qsymm8(const ITensor *src, ITensor *dst, co
}
template <typename TIn, typename TOut>
+void CpuQuantizeKernel::run_requantize_offset_only_convert(const ITensor *src, ITensor *dst, const Window &window)
+{
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+
+ // Calculate output offset difference.
+ const UniformQuantizationInfo uqinfo_in = src->info()->quantization_info().uniform();
+ UniformQuantizationInfo uqinfo = dst->info()->quantization_info().uniform();
+ uqinfo = compute_requantization_scale_offset(uqinfo_in, uqinfo);
+
+ // Collapse window and reset first dimension to handle tail calculations manually
+ Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
+
+ win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ // Duplicate offset in signed vector format
+ const int8x16_t offset = wrapper::vdup_n(static_cast<int8_t>(uqinfo.offset), wrapper::traits::vector_128_tag{});
+
+ Iterator input(src, win_collapsed);
+ Iterator output(dst, win_collapsed);
+ execute_window_loop(
+ win_collapsed,
+ [&](const Coordinates &)
+ {
+ auto input_ptr = reinterpret_cast<const TIn *>(input.ptr());
+ auto output_ptr = reinterpret_cast<TOut *>(output.ptr());
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step); x += window_step)
+ {
+ const wrapper::traits::neon_vector_t<TIn, window_step> qv =
+ wrapper::vloadq(input_ptr + x); // load 128 bit vector of 8 bit datatype
+
+ // Signed addition.
+ auto res = vaddq_s8(reinterpret_cast<int8x16_t>(qv), offset);
+
+ // Output is dependent on datatype.
+ wrapper::vstore(&output_ptr[x],
+ reinterpret_cast<wrapper::traits::neon_vector_t<TOut, window_step>>(res));
+ }
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ auto result = uqinfo.offset + static_cast<int32_t>(input_ptr[x]);
+ output_ptr[x] = static_cast<TOut>(result);
+ }
+ },
+ input, output);
+}
+
+template <typename TIn, typename TOut>
+void CpuQuantizeKernel::run_requantize_offset_only(const ITensor *src, ITensor *dst, const Window &window)
+{
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+
+ const UniformQuantizationInfo uqinfo_in = src->info()->quantization_info().uniform();
+ UniformQuantizationInfo uqinfo = dst->info()->quantization_info().uniform();
+ uqinfo = compute_requantization_scale_offset(uqinfo_in, uqinfo);
+
+ // Collapse window and reset first dimension to handle tail calculations manually
+ Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
+ win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ // Duplicate offset in signed vector format
+ const int16x8_t offset = wrapper::vdup_n(static_cast<int16_t>(uqinfo.offset), wrapper::traits::vector_128_tag{});
+
+ const int32_t low_bound = (dst->info()->data_type() == DataType::QASYMM8) ? 0 : -128;
+ const int32_t upper_bound = (dst->info()->data_type() == DataType::QASYMM8) ? 255 : 127;
+
+ Iterator input(src, win_collapsed);
+ Iterator output(dst, win_collapsed);
+ execute_window_loop(
+ win_collapsed,
+ [&](const Coordinates &)
+ {
+ auto input_ptr = reinterpret_cast<const TIn *>(input.ptr());
+ TOut *output_ptr = reinterpret_cast<TOut *>(output.ptr());
+
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step); x += window_step)
+ {
+ const auto qv = wrapper::vloadq(input_ptr + x); // load 128 bit vector of 8 bit datatype
+ int16x8_t lower = reinterpret_cast<int16x8_t>(wrapper::vmovl(wrapper::vgetlow(qv)));
+ int16x8_t upper = reinterpret_cast<int16x8_t>(wrapper::vmovl(wrapper::vgethigh(qv)));
+
+ // Signed addition.
+ lower = wrapper::vqadd(lower, offset);
+ upper = wrapper::vqadd(upper, offset);
+
+ // Output is dependent on datatype.
+ auto res = recombine_8_16<TOut>(lower, upper);
+ wrapper::vstore(&output_ptr[x], res);
+ }
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ // Add offset and clamp result to within the range of the output datatype.
+ int32_t result = uqinfo.offset + static_cast<int32_t>(input_ptr[x]);
+ result = utility::clamp<int32_t>(result, low_bound, upper_bound);
+
+ // Cast result to output datatype.
+ output_ptr[x] = static_cast<TOut>(result);
+ }
+ },
+ input, output);
+}
+
+template <typename TIn, typename TOut>
void CpuQuantizeKernel::run_quantize_qasymm8(const ITensor *src, ITensor *dst, const Window &window)
{
const auto window_start_x = static_cast<int>(window.x().start());
@@ -302,6 +455,7 @@ const char *CpuQuantizeKernel::name() const
{
return "CpuQuantizeKernel";
}
+
} // namespace kernels
} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/kernels/CpuQuantizeKernel.h b/src/cpu/kernels/CpuQuantizeKernel.h
index d6714136da..c2f7ac6d9d 100644
--- a/src/cpu/kernels/CpuQuantizeKernel.h
+++ b/src/cpu/kernels/CpuQuantizeKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2022 Arm Limited.
+ * Copyright (c) 2017-2022, 2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_CPU_QUANTIZE_KERNEL_H
-#define ARM_COMPUTE_CPU_QUANTIZE_KERNEL_H
+#ifndef ACL_SRC_CPU_KERNELS_CPUQUANTIZEKERNEL_H
+#define ACL_SRC_CPU_KERNELS_CPUQUANTIZEKERNEL_H
#include "src/core/common/Macros.h"
#include "src/cpu/ICpuKernel.h"
@@ -58,6 +58,15 @@ public:
*/
static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
+ /** Get the preferred dimension in which the scheduler splits the work into multiple jobs.
+ *
+ * @return The split dimension hint.
+ */
+ size_t get_split_dimension_hint() const
+ {
+ return _split_dimension;
+ }
+
// Inherited methods overridden:
void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
const char *name() const override;
@@ -86,9 +95,17 @@ private:
template <typename TIn, typename TOut>
void run_quantize_qsymm8(const ITensor *src, ITensor *dst, const Window &window);
+ template <typename TIn, typename TOut>
+ void run_requantize_offset_only(const ITensor *src, ITensor *dst, const Window &window);
+
+ template <typename TIn, typename TOut>
+ void run_requantize_offset_only_convert(const ITensor *src, ITensor *dst, const Window &window);
+
QuantizeFunctionExecutorPtr _func{nullptr};
+ size_t _split_dimension{Window::DimY};
};
+
} // namespace kernels
} // namespace cpu
} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_QUANTIZE_KERNEL_H */
+#endif // ACL_SRC_CPU_KERNELS_CPUQUANTIZEKERNEL_H
diff --git a/src/cpu/operators/CpuQuantize.cpp b/src/cpu/operators/CpuQuantize.cpp
index 4315499c39..4a3f1827c7 100644
--- a/src/cpu/operators/CpuQuantize.cpp
+++ b/src/cpu/operators/CpuQuantize.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -55,7 +55,8 @@ void CpuQuantize::configure(const ITensorInfo *src, ITensorInfo *dst)
void CpuQuantize::run(ITensorPack &tensors)
{
ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided");
- NEScheduler::get().schedule_op(_kernel.get(), Window::DimY, _kernel->window(), tensors);
+ auto split_dimension = static_cast<kernels::CpuQuantizeKernel *>(_kernel.get())->get_split_dimension_hint();
+ NEScheduler::get().schedule_op(_kernel.get(), split_dimension, _kernel->window(), tensors);
}
} // namespace cpu
} // namespace arm_compute