From 3d13af8a39f408318328a95d5329bc17fd923438 Mon Sep 17 00:00:00 2001 From: Georgios Pinitas Date: Tue, 4 Jun 2019 13:04:16 +0100 Subject: COMPMID-2235: Extend type support for CL/NEON DequantizationLayer. Adds support for: - QSYMM8 Change-Id: Ia0b839fc844ce0f968dad1b69a001f9a660dbcd5 Signed-off-by: Georgios Pinitas Reviewed-on: https://review.mlplatform.org/c/1378 Comments-Addressed: Arm Jenkins Tested-by: Arm Jenkins Reviewed-by: Manuel Bottini Reviewed-by: Michalis Spyrou --- src/core/CL/CLHelpers.cpp | 10 +-- src/core/CL/cl_kernels/dequantization_layer.cl | 15 +++-- .../CL/kernels/CLDequantizationLayerKernel.cpp | 12 ++-- .../NEON/kernels/NEDequantizationLayerKernel.cpp | 74 ++++++++++++++++++++-- src/core/Utils.cpp | 2 + src/runtime/CL/CLTensorAllocator.cpp | 9 +-- 6 files changed, 96 insertions(+), 26 deletions(-) (limited to 'src') diff --git a/src/core/CL/CLHelpers.cpp b/src/core/CL/CLHelpers.cpp index f4ceca8200..2e6ceb4433 100644 --- a/src/core/CL/CLHelpers.cpp +++ b/src/core/CL/CLHelpers.cpp @@ -37,11 +37,11 @@ std::string get_cl_type_from_data_type(const DataType &dt) switch(dt) { case DataType::U8: + case DataType::QASYMM8: return "uchar"; case DataType::S8: + case DataType::QSYMM8: return "char"; - case DataType::QASYMM8: - return "uchar"; case DataType::U16: return "ushort"; case DataType::S16: @@ -69,11 +69,11 @@ std::string get_cl_select_type_from_data_type(const DataType &dt) switch(dt) { case DataType::U8: + case DataType::QASYMM8: return "uchar"; case DataType::S8: + case DataType::QSYMM8: return "char"; - case DataType::QASYMM8: - return "uchar"; case DataType::U16: return "ushort"; case DataType::F16: @@ -100,6 +100,7 @@ std::string get_data_size_from_data_type(const DataType &dt) { case DataType::U8: case DataType::S8: + case DataType::QSYMM8: case DataType::QASYMM8: return "8"; case DataType::U16: @@ -241,6 +242,7 @@ size_t preferred_vector_width(const cl::Device &device, const DataType dt) case DataType::U8: case DataType::S8: case 
DataType::QASYMM8: + case DataType::QSYMM8: return device.getInfo(); case DataType::U16: case DataType::S16: diff --git a/src/core/CL/cl_kernels/dequantization_layer.cl b/src/core/CL/cl_kernels/dequantization_layer.cl index 7307700473..ad3ed35480 100644 --- a/src/core/CL/cl_kernels/dequantization_layer.cl +++ b/src/core/CL/cl_kernels/dequantization_layer.cl @@ -23,16 +23,17 @@ */ #include "helpers.h" -#if defined(VEC_SIZE) && defined(DATA_TYPE) && defined(SCALE) && defined(OFFSET) +#if defined(VEC_SIZE) && defined(DATA_TYPE_SRC) && defined(DATA_TYPE_DST) && defined(SCALE) && defined(OFFSET) /** This performs the dequantization of 8-bit unsigned integers to floating point. * - * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float + * @note Source datatype should be given as a preprocessor argument using -DDATA_TYPE_SRC=type. e.g. -DDATA_TYPE_SRC=char + * @note Destination datatype should be given as a preprocessor argument using -DDATA_TYPE_DST=type. e.g. -DDATA_TYPE_DST=float * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 * @note Quantization scale of input tensor is passed in with -DSCALE=scale. * @note Quantization offset of input tensor is passed in with -DOFFSET=offset. * - * @param[in] input_ptr Pointer to the source tensor. Supported data types: QASYMM8 + * @param[in] input_ptr Pointer to the source tensor. 
Supported data types: QASYMM8/QSYMM8 * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) @@ -66,7 +67,7 @@ __kernel void dequantization_layer( // Load data VEC_DATA_TYPE(int, VEC_SIZE) - val = CONVERT(VLOAD(VEC_SIZE)(0, (__global uchar *)input.ptr), VEC_DATA_TYPE(int, VEC_SIZE)); + val = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_SRC *)input.ptr), VEC_DATA_TYPE(int, VEC_SIZE)); // Create scale and offset vectors const VEC_DATA_TYPE(float, VEC_SIZE) @@ -81,10 +82,10 @@ __kernel void dequantization_layer( // Store result VSTORE(VEC_SIZE) - (CONVERT(res, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), 0, (__global DATA_TYPE *)output.ptr); + (CONVERT(res, VEC_DATA_TYPE(DATA_TYPE_DST, VEC_SIZE)), 0, (__global DATA_TYPE_DST *)output.ptr); #else // !defined(VEC_SIZE) || !defined(LAST_ACCESSED_X) - *((__global DATA_TYPE *)(output.ptr)) = (DATA_TYPE)((float)((int)(*((__global uchar *)(input.ptr))) - (int)(OFFSET)) * (float)(SCALE)); + *((__global DATA_TYPE_DST *)(output.ptr)) = (DATA_TYPE_DST)((float)((int)(*((__global DATA_TYPE_SRC *)(input.ptr))) - (int)(OFFSET)) * (float)(SCALE)); #endif // defined(VEC_SIZE) && defined(LAST_ACCESSED_X) } -#endif // defined(VEC_SIZE) && defined(DATA_TYPE) && defined(SCALE) && defined(OFFSET) \ No newline at end of file +#endif // defined(VEC_SIZE) && defined(DATA_TYPE_SRC) && defined(DATA_TYPE_DST) && defined(SCALE) && defined(OFFSET) diff --git a/src/core/CL/kernels/CLDequantizationLayerKernel.cpp b/src/core/CL/kernels/CLDequantizationLayerKernel.cpp index 0b066837a9..e383bc475d 100644 --- a/src/core/CL/kernels/CLDequantizationLayerKernel.cpp +++ b/src/core/CL/kernels/CLDequantizationLayerKernel.cpp @@ -40,7 +40,7 @@ namespace Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output) { 
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QSYMM8); if(output->tensor_shape().total_size() > 0) { @@ -95,15 +95,19 @@ void CLDequantizationLayerKernel::configure(const ICLTensor *input, ICLTensor *o } ICLKernel::configure_internal(win); - const UniformQuantizationInfo qinfo = input->info()->quantization_info().uniform(); + const UniformQuantizationInfo qinfo = input->info()->quantization_info().uniform(); + const int qoffset = is_data_type_quantized_asymmetric(input->info()->data_type()) ? qinfo.offset : 0; // Create kernel CLBuildOptions build_opts; build_opts.add_option("-DSCALE=" + float_to_string_with_full_precision(qinfo.scale)); - build_opts.add_option("-DOFFSET=" + support::cpp11::to_string(qinfo.offset)); + build_opts.add_option("-DOFFSET=" + support::cpp11::to_string(qoffset)); build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x)); - build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(output->info()->data_type())); + build_opts.add_option("-DDATA_TYPE_SRC=" + get_cl_type_from_data_type(input->info()->data_type())); + build_opts.add_option("-DDATA_TYPE_DST=" + get_cl_type_from_data_type(output->info()->data_type())); build_opts.add_option_if(multi_access_x, "-DLAST_ACCESSED_X=" + support::cpp11::to_string(std::max(output_width_x - vec_size_x, 0))); + + // Create kernel name _kernel = static_cast(CLKernelLibrary::get().create_kernel("dequantization_layer", build_opts.options())); } diff --git a/src/core/NEON/kernels/NEDequantizationLayerKernel.cpp b/src/core/NEON/kernels/NEDequantizationLayerKernel.cpp index a6dc0977d2..bf0a2ca7bf 100644 --- a/src/core/NEON/kernels/NEDequantizationLayerKernel.cpp +++ b/src/core/NEON/kernels/NEDequantizationLayerKernel.cpp @@ -42,7 +42,7 @@ namespace Status validate_arguments(const ITensorInfo *input, 
const ITensorInfo *output) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QSYMM8); if(output->tensor_shape().total_size() > 0) { @@ -95,9 +95,11 @@ inline void store_result(float16_t *ptr, const float32x4x4_t &v) #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ template -void run_dequantization(const ITensor *input, ITensor *output, const Window &window) +void run_dequantization_qasymm8(const ITensor *input, ITensor *output, const Window &window) { - const UniformQuantizationInfo &qinfo = input->info()->quantization_info().uniform(); + const UniformQuantizationInfo &qinfo = input->info()->quantization_info().uniform(); + const float scale = qinfo.scale; + const int32_t offset = qinfo.offset; const int window_step_x = 16; const auto window_start_x = static_cast(window.x().start()); @@ -120,7 +122,49 @@ void run_dequantization(const ITensor *input, ITensor *output, const Window &win for(; x <= (window_end_x - window_step_x); x += window_step_x) { const auto vin = wrapper::vloadq(in_ptr + x); - const auto vdeq = vdequantize(vin, qinfo); + const auto vdeq = vdequantize(vin, scale, offset); + + store_result(reinterpret_cast(out_ptr + x), vdeq); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + uint8_t val = *(in_ptr + x); + *(out_ptr + x) = static_cast(dequantize(val, scale, offset)); + } + }, + in, out); +} + +template +void run_dequantization_qsymm8(const ITensor *input, ITensor *output, const Window &window) +{ + const UniformQuantizationInfo &qinfo = input->info()->quantization_info().uniform(); + const float scale = qinfo.scale; + + const int window_step_x = 16; + const auto window_start_x = static_cast(window.x().start()); + const auto window_end_x = static_cast(window.x().end()); + + // Collapse window and reset first dimension to handle tail calculations 
manually + Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); + win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); + + // Create iterators + Iterator in(input, win_collapsed); + Iterator out(output, win_collapsed); + + execute_window_loop(win_collapsed, [&](const Coordinates &) + { + const auto in_ptr = reinterpret_cast(in.ptr()); + const auto out_ptr = reinterpret_cast(out.ptr()); + + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto vin = wrapper::vloadq(in_ptr + x); + const auto vdeq = vdequantize(vin, scale); store_result(reinterpret_cast(out_ptr + x), vdeq); } @@ -129,11 +173,27 @@ void run_dequantization(const ITensor *input, ITensor *output, const Window &win for(; x < window_end_x; ++x) { uint8_t val = *(in_ptr + x); - *(out_ptr + x) = static_cast(dequantize_qasymm8(val, qinfo)); + *(out_ptr + x) = static_cast(dequantize(val, scale)); } }, in, out); } + +template +void run_dequantization_core(const ITensor *input, ITensor *output, const Window &window) +{ + switch(input->info()->data_type()) + { + case DataType::QASYMM8: + run_dequantization_qasymm8(input, output, window); + break; + case DataType::QSYMM8: + run_dequantization_qsymm8(input, output, window); + break; + default: + ARM_COMPUTE_ERROR("Unsupported data type."); + } +} } // namespace NEDequantizationLayerKernel::NEDequantizationLayerKernel() @@ -173,11 +233,11 @@ void NEDequantizationLayerKernel::run(const Window &window, const ThreadInfo &in switch(_output->info()->data_type()) { case DataType::F32: - run_dequantization(_input, _output, window); + run_dequantization_core(_input, _output, window); break; #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC case DataType::F16: - run_dequantization(_input, _output, window); + run_dequantization_core(_input, _output, window); break; #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ default: diff --git a/src/core/Utils.cpp b/src/core/Utils.cpp index 22be0002ee..499a6c8b29 
100644 --- a/src/core/Utils.cpp +++ b/src/core/Utils.cpp @@ -158,6 +158,8 @@ const std::string &arm_compute::string_from_data_type(DataType dt) { DataType::F32, "F32" }, { DataType::F64, "F64" }, { DataType::SIZET, "SIZET" }, + { DataType::QSYMM8, "QSYMM8" }, + { DataType::QSYMM8_PER_CHANNEL, "QSYMM8_PER_CHANNEL" }, { DataType::QASYMM8, "QASYMM8" }, { DataType::QSYMM16, "QSYMM16" }, }; diff --git a/src/runtime/CL/CLTensorAllocator.cpp b/src/runtime/CL/CLTensorAllocator.cpp index 63aa1ba9ea..f3f16cd8c0 100644 --- a/src/runtime/CL/CLTensorAllocator.cpp +++ b/src/runtime/CL/CLTensorAllocator.cpp @@ -87,11 +87,12 @@ void populate_quantization_info(CLFloatArray &scale, CLInt32Array &offset, const clear_quantization_arrays(scale, offset); // Create scale array - const size_t num_elements = qinfo.scale.size(); - const size_t element_size = sizeof(decltype(qinfo.scale)::value_type); - scale = CLFloatArray(num_elements + pad_size); + const std::vector &qscale = qinfo.scale(); + const size_t num_elements = qscale.size(); + const size_t element_size = sizeof(std::remove_reference::type::value_type); + scale = CLFloatArray(num_elements + pad_size); scale.resize(num_elements); - CLScheduler::get().queue().enqueueWriteBuffer(scale.cl_buffer(), CL_TRUE, 0, num_elements * element_size, qinfo.scale.data()); + CLScheduler::get().queue().enqueueWriteBuffer(scale.cl_buffer(), CL_TRUE, 0, num_elements * element_size, qinfo.scale().data()); } } // namespace -- cgit v1.2.1