about summary refs log tree commit diff
path: root/src
diff options
context:
space:
mode:
authorMohammed Suhail Munshi <MohammedSuhail.Munshi@arm.com>2024-01-30 18:25:51 +0000
committerGunes Bayir <gunes.bayir@arm.com>2024-02-20 11:31:57 +0000
commit0a48c4c83b598991b4d4235f870c24d9e6634b20 (patch)
tree4d0117496c527fd952f435711e5c385023d7068e /src
parent946905847bf1d82b183e718fddfc7664702e5a84 (diff)
downloadComputeLibrary-0a48c4c83b598991b4d4235f870c24d9e6634b20.tar.gz
Requantization cases for offset changes only
Resolves: [COMPMID-6681]
Signed-off-by: Mohammed Suhail Munshi <MohammedSuhail.Munshi@arm.com>
Change-Id: I325b9d478dd1d04a45533bb7708cf76e98ee0cee
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/11058
Reviewed-by: Gunes Bayir <gunes.bayir@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Benchmark: Arm Jenkins <bsgcomp@arm.com>
Diffstat (limited to 'src')
-rw-r--r--src/cpu/kernels/CpuQuantizeKernel.cpp170
-rw-r--r--src/cpu/kernels/CpuQuantizeKernel.h25
-rw-r--r--src/cpu/operators/CpuQuantize.cpp5
3 files changed, 186 insertions, 14 deletions
diff --git a/src/cpu/kernels/CpuQuantizeKernel.cpp b/src/cpu/kernels/CpuQuantizeKernel.cpp
index 5dde680837..d2ac6cf8ac 100644
--- a/src/cpu/kernels/CpuQuantizeKernel.cpp
+++ b/src/cpu/kernels/CpuQuantizeKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2022 Arm Limited.
+ * Copyright (c) 2017-2022, 2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -104,6 +104,18 @@ vector_type<int8_t> vquantize_qasymm8<int8_t>(const float32x4x4_t &qv, const Uni
return vquantize_signed(qv, qi);
}
+template <typename TOut, typename = typename std::enable_if<std::is_signed<TOut>::value, bool>::type>
+inline int8x16_t recombine_8_16(int16x8_t lower, int16x8_t upper)
+{
+ return wrapper::vcombine(wrapper::vqmovn(lower), wrapper::vqmovn(upper));
+}
+
+template <typename TOut, typename = typename std::enable_if<std::is_unsigned<TOut>::value, bool>::type>
+inline uint8x16_t recombine_8_16(int16x8_t lower, int16x8_t upper)
+{
+ return wrapper::vcombine(wrapper::vqmovun(lower), wrapper::vqmovun(upper));
+}
+
} // namespace
void CpuQuantizeKernel::configure(const ITensorInfo *src, ITensorInfo *dst)
@@ -120,6 +132,19 @@ void CpuQuantizeKernel::configure(const ITensorInfo *src, ITensorInfo *dst)
{"op_QASYMM8_SIGNED_QASYMM8_SIGNED", &CpuQuantizeKernel::run_quantize_qasymm8<int8_t, int8_t>},
{"op_QASYMM8_SIGNED_QASYMM16", &CpuQuantizeKernel::run_quantize_qasymm16<int8_t>},
+ // Functions for offset only requantization
+ {"op_OFFSET_ONLY_QASYMM8_QASYMM8", &CpuQuantizeKernel::run_requantize_offset_only<uint8_t, uint8_t>},
+ {"op_OFFSET_ONLY_QASYMM8_QASYMM8_SIGNED", &CpuQuantizeKernel::run_requantize_offset_only<uint8_t, int8_t>},
+ {"op_OFFSET_ONLY_QASYMM8_SIGNED_QASYMM8", &CpuQuantizeKernel::run_requantize_offset_only<int8_t, uint8_t>},
+ {"op_OFFSET_ONLY_QASYMM8_SIGNED_QASYMM8_SIGNED",
+ &CpuQuantizeKernel::run_requantize_offset_only<int8_t, int8_t>},
+
+ // Functions for offset uint8 to int8 and vice versa quantization (no scale changes)
+ {"op_OFFSET_ONLY_CONVERT_QASYMM8_SIGNED_QASYMM8",
+ &CpuQuantizeKernel::run_requantize_offset_only_convert<int8_t, uint8_t>},
+ {"op_OFFSET_ONLY_CONVERT_QASYMM8_QASYMM8_SIGNED",
+ &CpuQuantizeKernel::run_requantize_offset_only_convert<uint8_t, int8_t>},
+
{"op_F32_QSYMM8", &CpuQuantizeKernel::run_quantize_qsymm8<float, int8_t>},
{"op_F32_QASYMM8", &CpuQuantizeKernel::run_quantize_qasymm8<float, uint8_t>},
@@ -134,6 +159,26 @@ void CpuQuantizeKernel::configure(const ITensorInfo *src, ITensorInfo *dst)
};
std::string function_to_call("op_");
+
+ // For offset only functions - must be 8-bit and have identical scale values.
+ if (src->quantization_info().scale() == dst->quantization_info().scale() &&
+ (is_data_type_quantized_asymmetric_char(src->data_type()) &&
+ is_data_type_quantized_asymmetric_char(dst->data_type())))
+ {
+ function_to_call += "OFFSET_ONLY_";
+ // For optimized datatype conversion 8-bit re-quantization offset only functions.
+ // These must have an offset of exactly 128 to match requirements - has specific circumstances to match use case.
+ auto uqinfo =
+ compute_requantization_scale_offset(src->quantization_info().uniform(), dst->quantization_info().uniform());
+ const auto src_dt = src->data_type();
+ if (src->data_type() != dst->data_type() && ((src_dt == DataType::QASYMM8_SIGNED && uqinfo.offset == 128) ||
+ (src_dt == DataType::QASYMM8 && uqinfo.offset == -128)))
+ {
+ function_to_call += "CONVERT_";
+ }
+ }
+
+ // Specify datatype for function
function_to_call += string_from_data_type(src->data_type()) + "_";
function_to_call += string_from_data_type(dst->data_type());
@@ -145,9 +190,11 @@ void CpuQuantizeKernel::configure(const ITensorInfo *src, ITensorInfo *dst)
}
_func = it->second;
- // Configure kernel window
- Window win_config = calculate_max_window(*src, Steps());
- ICpuKernel::configure(win_config);
+ // Calculate window. Squash if possible.
+ Window win;
+ std::tie(win, _split_dimension) = calculate_squashed_or_max_window(*src);
+
+ ICpuKernel::configure(win);
}
Status CpuQuantizeKernel::validate(const ITensorInfo *src, const ITensorInfo *dst)
@@ -164,10 +211,8 @@ void CpuQuantizeKernel::run_quantize_qsymm8(const ITensor *src, ITensor *dst, co
const UniformQuantizationInfo uqinfo_in = src->info()->quantization_info().uniform();
UniformQuantizationInfo uqinfo = dst->info()->quantization_info().uniform();
- if (is_data_type_quantized_asymmetric(src->info()->data_type()))
- {
- uqinfo = compute_requantization_scale_offset(uqinfo_in, uqinfo);
- }
+ uqinfo = compute_requantization_scale_offset(uqinfo_in, uqinfo);
+
// Collapse window and reset first dimension to handle tail calculations manually
Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
@@ -195,6 +240,114 @@ void CpuQuantizeKernel::run_quantize_qsymm8(const ITensor *src, ITensor *dst, co
}
template <typename TIn, typename TOut>
+void CpuQuantizeKernel::run_requantize_offset_only_convert(const ITensor *src, ITensor *dst, const Window &window)
+{
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+
+ // Calculate output offset difference.
+ const UniformQuantizationInfo uqinfo_in = src->info()->quantization_info().uniform();
+ UniformQuantizationInfo uqinfo = dst->info()->quantization_info().uniform();
+ uqinfo = compute_requantization_scale_offset(uqinfo_in, uqinfo);
+
+ // Collapse window and reset first dimension to handle tail calculations manually
+ Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
+
+ win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ // Duplicate offset in signed vector format
+ const int8x16_t offset = wrapper::vdup_n(static_cast<int8_t>(uqinfo.offset), wrapper::traits::vector_128_tag{});
+
+ Iterator input(src, win_collapsed);
+ Iterator output(dst, win_collapsed);
+ execute_window_loop(
+ win_collapsed,
+ [&](const Coordinates &)
+ {
+ auto input_ptr = reinterpret_cast<const TIn *>(input.ptr());
+ auto output_ptr = reinterpret_cast<TOut *>(output.ptr());
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step); x += window_step)
+ {
+ const wrapper::traits::neon_vector_t<TIn, window_step> qv =
+ wrapper::vloadq(input_ptr + x); // load 128 bit vector of 8 bit datatype
+
+ // Signed addition.
+ auto res = vaddq_s8(reinterpret_cast<int8x16_t>(qv), offset);
+
+ // Output is dependent on datatype.
+ wrapper::vstore(&output_ptr[x],
+ reinterpret_cast<wrapper::traits::neon_vector_t<TOut, window_step>>(res));
+ }
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ auto result = uqinfo.offset + static_cast<int32_t>(input_ptr[x]);
+ output_ptr[x] = static_cast<TOut>(result);
+ }
+ },
+ input, output);
+}
+
+template <typename TIn, typename TOut>
+void CpuQuantizeKernel::run_requantize_offset_only(const ITensor *src, ITensor *dst, const Window &window)
+{
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+
+ const UniformQuantizationInfo uqinfo_in = src->info()->quantization_info().uniform();
+ UniformQuantizationInfo uqinfo = dst->info()->quantization_info().uniform();
+ uqinfo = compute_requantization_scale_offset(uqinfo_in, uqinfo);
+
+ // Collapse window and reset first dimension to handle tail calculations manually
+ Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
+ win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ // Duplicate offset in signed vector format
+ const int16x8_t offset = wrapper::vdup_n(static_cast<int16_t>(uqinfo.offset), wrapper::traits::vector_128_tag{});
+
+ const int32_t low_bound = (dst->info()->data_type() == DataType::QASYMM8) ? 0 : -128;
+ const int32_t upper_bound = (dst->info()->data_type() == DataType::QASYMM8) ? 255 : 127;
+
+ Iterator input(src, win_collapsed);
+ Iterator output(dst, win_collapsed);
+ execute_window_loop(
+ win_collapsed,
+ [&](const Coordinates &)
+ {
+ auto input_ptr = reinterpret_cast<const TIn *>(input.ptr());
+ TOut *output_ptr = reinterpret_cast<TOut *>(output.ptr());
+
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step); x += window_step)
+ {
+ const auto qv = wrapper::vloadq(input_ptr + x); // load 128 bit vector of 8 bit datatype
+ int16x8_t lower = reinterpret_cast<int16x8_t>(wrapper::vmovl(wrapper::vgetlow(qv)));
+ int16x8_t upper = reinterpret_cast<int16x8_t>(wrapper::vmovl(wrapper::vgethigh(qv)));
+
+ // Signed addition.
+ lower = wrapper::vqadd(lower, offset);
+ upper = wrapper::vqadd(upper, offset);
+
+ // Output is dependent on datatype.
+ auto res = recombine_8_16<TOut>(lower, upper);
+ wrapper::vstore(&output_ptr[x], res);
+ }
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ // Add offset and clamp result to within the range of the output datatype.
+ int32_t result = uqinfo.offset + static_cast<int32_t>(input_ptr[x]);
+ result = utility::clamp<int32_t>(result, low_bound, upper_bound);
+
+ // Cast result to output datatype.
+ output_ptr[x] = static_cast<TOut>(result);
+ }
+ },
+ input, output);
+}
+
+template <typename TIn, typename TOut>
void CpuQuantizeKernel::run_quantize_qasymm8(const ITensor *src, ITensor *dst, const Window &window)
{
const auto window_start_x = static_cast<int>(window.x().start());
@@ -302,6 +455,7 @@ const char *CpuQuantizeKernel::name() const
{
return "CpuQuantizeKernel";
}
+
} // namespace kernels
} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/kernels/CpuQuantizeKernel.h b/src/cpu/kernels/CpuQuantizeKernel.h
index d6714136da..c2f7ac6d9d 100644
--- a/src/cpu/kernels/CpuQuantizeKernel.h
+++ b/src/cpu/kernels/CpuQuantizeKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2022 Arm Limited.
+ * Copyright (c) 2017-2022, 2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_CPU_QUANTIZE_KERNEL_H
-#define ARM_COMPUTE_CPU_QUANTIZE_KERNEL_H
+#ifndef ACL_SRC_CPU_KERNELS_CPUQUANTIZEKERNEL_H
+#define ACL_SRC_CPU_KERNELS_CPUQUANTIZEKERNEL_H
#include "src/core/common/Macros.h"
#include "src/cpu/ICpuKernel.h"
@@ -58,6 +58,15 @@ public:
*/
static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
+ /** Get the preferred dimension in which the scheduler splits the work into multiple jobs.
+ *
+ * @return The split dimension hint.
+ */
+ size_t get_split_dimension_hint() const
+ {
+ return _split_dimension;
+ }
+
// Inherited methods overridden:
void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
const char *name() const override;
@@ -86,9 +95,17 @@ private:
template <typename TIn, typename TOut>
void run_quantize_qsymm8(const ITensor *src, ITensor *dst, const Window &window);
+ template <typename TIn, typename TOut>
+ void run_requantize_offset_only(const ITensor *src, ITensor *dst, const Window &window);
+
+ template <typename TIn, typename TOut>
+ void run_requantize_offset_only_convert(const ITensor *src, ITensor *dst, const Window &window);
+
QuantizeFunctionExecutorPtr _func{nullptr};
+ size_t _split_dimension{Window::DimY};
};
+
} // namespace kernels
} // namespace cpu
} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_QUANTIZE_KERNEL_H */
+#endif // ACL_SRC_CPU_KERNELS_CPUQUANTIZEKERNEL_H
diff --git a/src/cpu/operators/CpuQuantize.cpp b/src/cpu/operators/CpuQuantize.cpp
index 4315499c39..4a3f1827c7 100644
--- a/src/cpu/operators/CpuQuantize.cpp
+++ b/src/cpu/operators/CpuQuantize.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -55,7 +55,8 @@ void CpuQuantize::configure(const ITensorInfo *src, ITensorInfo *dst)
void CpuQuantize::run(ITensorPack &tensors)
{
ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided");
- NEScheduler::get().schedule_op(_kernel.get(), Window::DimY, _kernel->window(), tensors);
+ auto split_dimension = static_cast<kernels::CpuQuantizeKernel *>(_kernel.get())->get_split_dimension_hint();
+ NEScheduler::get().schedule_op(_kernel.get(), split_dimension, _kernel->window(), tensors);
}
} // namespace cpu
} // namespace arm_compute