Optimize CL Scale/Resize Quantized by removing (de)quant. code

This patch removes the quant/dequant code in CLScale and the Resize operator in dynamic fusion. We don't support different quantization information for input and output and in this case the quantization and dequantization is not necessary. The very same optimization was delivered for cpu. It also moves the SCALE_X and SCALE_Y arguments to look-up table from build options in the template writer of Resize. Change-Id: Icd043c8671220c8feea935dd4b24a5b17c6c4ea4 Signed-off-by: Gunes Bayir <gunes.bayir@arm.com> Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/8888 Benchmark: Arm Jenkins <bsgcomp@arm.com> Tested-by: Arm Jenkins <bsgcomp@arm.com> Reviewed-by: Viet-Hoa Do <viet-hoa.do@arm.com> Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com> Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
author: Gunes Bayir <gunes.bayir@arm.com> 2022-12-26 16:24:04 +0000
committer: Gunes Bayir <gunes.bayir@arm.com> 2022-12-29 17:27:33 +0000
commit: b7e8626717b2ef81b0d03284c8f6ffdbe9cd2245 (patch)
tree: 8fb8197a4f69fb5024a5330e031d3d5c0b26bdad
parent: a5cb79f18685292bf5b63a0c484a58945320823d (diff)
download: ComputeLibrary-b7e8626717b2ef81b0d03284c8f6ffdbe9cd2245.tar.gz
3 files changed, 27 insertions, 89 deletions
diff --git a/src/core/CL/cl_kernels/nhwc/scale.cl b/src/core/CL/cl_kernels/nhwc/scale.cl
index bccfd6543a..f6a3e0971b 100644
--- a/src/core/CL/cl_kernels/nhwc/scale.cl
+++ b/src/core/CL/cl_kernels/nhwc/scale.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2021 Arm Limited.
+ * Copyright (c) 2016-2022 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -218,34 +218,17 @@ __kernel void scale_bilinear_nhwc(
     // Calculate the output
     out[0].v = ((in00[0].v * b * b1) + (in01[0].v * a * b1) + (in10[0].v * b * a1) + (in11[0].v * a * a1));
 #else  // defined(IS_FLOATING_POINT)
-    TILE(float, 1, N0, out_f);
-    TILE(float, 1, N0, in00_f);
-    TILE(float, 1, N0, in01_f);
-    TILE(float, 1, N0, in10_f);
-    TILE(float, 1, N0, in11_f);
 
     const float a  = (xi_f - (float)xi);
     const float b  = (1.f - a);
     const float a1 = (yi_f - (float)yi);
     const float b1 = (1.f - a1);
 
-    // Dequantize
-    LOOP_UNROLLING(int, n0, 0, 1, N0,
-    {
-        in00_f[0].s[n0] = ((float)in00[0].s[n0] - (float)OFFSET) * (float)SCALE;
-        in01_f[0].s[n0] = ((float)in01[0].s[n0] - (float)OFFSET) * (float)SCALE;
-        in10_f[0].s[n0] = ((float)in10[0].s[n0] - (float)OFFSET) * (float)SCALE;
-        in11_f[0].s[n0] = ((float)in11[0].s[n0] - (float)OFFSET) * (float)SCALE;
-    })
-
-    // Calculate the output in the floating-point domain
-    out_f[0].v = ((in00_f[0].v * b * b1) + (in01_f[0].v * a * b1) + (in10_f[0].v * b * a1) + (in11_f[0].v * a * a1));
-
-    // Quantize
-    LOOP_UNROLLING(int, n0, 0, 1, N0,
-    {
-        out[0].s[n0] = CONVERT_SAT(out_f[0].s[n0] / (float)SCALE + (float)OFFSET, DST_DATA_TYPE);
-    })
+    out[0].v = CONVERT_SAT((CONVERT(in00[0].v, VEC_DATA_TYPE(float, N0)) * b * b1) + 
+                   (CONVERT(in01[0].v, VEC_DATA_TYPE(float, N0)) * a * b1) + 
+                   (CONVERT(in10[0].v, VEC_DATA_TYPE(float, N0)) * b * a1) + 
+                   (CONVERT(in11[0].v, VEC_DATA_TYPE(float, N0)) * a * a1), 
+                VEC_DATA_TYPE(DST_DATA_TYPE, N0));
 #endif // defined(IS_FLOATING_POINT)
 
     TILE(uint, 1, 1, dst_indirect_y);
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateResize.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateResize.cpp
index 7ee79e82af..a2c04d94e5 100644
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateResize.cpp
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateResize.cpp
@@ -66,15 +66,15 @@ TILE(uint, 1, 1, g_dst_indirect_y);
         if(_attributes.sampling_policy() == SamplingPolicy::TOP_LEFT)
         {
             code += R"_(
-    float xi_f = (g_ind_1 * SCALE_X);
-    float yi_f = (yo * SCALE_Y);
+    float xi_f = (g_ind_1 * {{SCALE_X}});
+    float yi_f = (yo * {{SCALE_Y}});
 )_";
         }
         else
         {
             code += R"_(
-    float xi_f = ((g_ind_1 + 0.5f) * SCALE_X);
-    float yi_f = ((yo + 0.5f) * SCALE_Y);
+    float xi_f = ((g_ind_1 + 0.5f) * {{SCALE_X}});
+    float yi_f = ((yo + 0.5f) * {{SCALE_Y}});
 )_";
         }
 
@@ -98,15 +98,15 @@ TILE(uint, 1, 1, g_dst_indirect_y);
         if(_attributes.sampling_policy() == SamplingPolicy::TOP_LEFT)
         {
             code += R"_(
-    float xi_f = (g_ind_1 * SCALE_X);
-    float yi_f = (yo * SCALE_Y);
+    float xi_f = (g_ind_1 * {{SCALE_X}});
+    float yi_f = (yo * {{SCALE_Y}});
 )_";
         }
         else
         {
             code += R"_(
-    float xi_f = ((g_ind_1 + 0.5f) * SCALE_X - 0.5f);
-    float yi_f = ((yo + 0.5f) * SCALE_Y - 0.5f);
+    float xi_f = ((g_ind_1 + 0.5f) * {{SCALE_X}} - 0.5f);
+    float yi_f = ((yo + 0.5f) * {{SCALE_Y}} - 0.5f);
 )_";
         }
 
@@ -150,37 +150,16 @@ TILE(uint, 1, 1, g_dst_indirect_y);
         else
         {
             code += R"_(
-    TILE(float, 1, N0, out_f);
-    TILE(float, 1, N0, in00_f);
-    TILE(float, 1, N0, in01_f);
-    TILE(float, 1, N0, in10_f);
-    TILE(float, 1, N0, in11_f);
-
     const float a  = (xi_f - (float)xi);
     const float b  = (1.f - a);
     const float a1 = (yi_f - (float)yi);
     const float b1 = (1.f - a1);
-)_"
-                    // Dequantize
-                    R"_(
-    LOOP_UNROLLING(int, n0, 0, 1, N0,
-    {
-        in00_f[0].s[n0] = ((float)in00[0].s[n0] - (float){{OFFSET}}) * (float){{SCALE}};
-        in01_f[0].s[n0] = ((float)in01[0].s[n0] - (float){{OFFSET}}) * (float){{SCALE}};
-        in10_f[0].s[n0] = ((float)in10[0].s[n0] - (float){{OFFSET}}) * (float){{SCALE}};
-        in11_f[0].s[n0] = ((float)in11[0].s[n0] - (float){{OFFSET}}) * (float){{SCALE}};
-    })
-)_"
-                    // Calculate the output in the floating-point domain
-                    R"_(
-    out_f[0].v = ((in00_f[0].v * b * b1) + (in01_f[0].v * a * b1) + (in10_f[0].v * b * a1) + (in11_f[0].v * a * a1));
-)_"
-                    // Quantize
-                    R"_(
-    LOOP_UNROLLING(int, n0, 0, 1, N0,
-    {
-        {{dst}}[0].s[n0] = CONVERT_SAT(out_f[0].s[n0] / (float){{SCALE}} + (float){{OFFSET}}, {{DST_DATA_TYPE}});
-    })
+
+    {{dst}}[0].v = CONVERT_SAT(
+        (CONVERT(in00[0].v, VEC_DATA_TYPE(float, N0)) * b * b1) + 
+        (CONVERT(in01[0].v, VEC_DATA_TYPE(float, N0)) * a * b1) +
+        (CONVERT(in10[0].v, VEC_DATA_TYPE(float, N0)) * b * a1) + 
+        (CONVERT(in11[0].v, VEC_DATA_TYPE(float, N0)) * a * a1), VEC_DATA_TYPE({{DST_DATA_TYPE}}, N0));
 )_";
         }
     }
@@ -231,20 +210,11 @@ TagLUT ClTemplateResize::get_tag_lut(const GpuKernelVariableTable &vtable, const
     lut["DST_DATA_TYPE"]   = get_cl_type_from_data_type(_dst->data_type());
     lut["CONSTANT_VALUE"]  = string_from_pixel_value(0, _src->data_type());
 
-    const bool is_qasymm_bilinear = is_data_type_quantized_asymmetric(_src->data_type())
-                                    && _attributes.interpolation_policy() == InterpolationPolicy::BILINEAR;
+    const float scale_x = scale_utils::calculate_resize_ratio(_src->dimension(1), _dst->dimension(1), _attributes.align_corners());
+    const float scale_y = scale_utils::calculate_resize_ratio(_src->dimension(2), _dst->dimension(2), _attributes.align_corners());
 
-    if(is_qasymm_bilinear)
-    {
-        const UniformQuantizationInfo qinfo = _src->quantization_info().uniform();
-        lut["SCALE"]                        = support::cpp11::to_string(qinfo.scale);
-        lut["OFFSET"]                       = support::cpp11::to_string(qinfo.offset);
-    }
-    else
-    {
-        lut["SCALE"]  = support::cpp11::to_string(1);
-        lut["OFFSET"] = support::cpp11::to_string(0);
-    }
+    lut["SCALE_X"] = float_to_string_with_full_precision(scale_x);
+    lut["SCALE_Y"] = float_to_string_with_full_precision(scale_y);
 
     return lut;
 }
@@ -256,16 +226,11 @@ CLBuildOptions ClTemplateResize::get_build_options(const IGpuTemplateComponentWr
     const unsigned int m0          = root_window.y().step();
     const unsigned int partial_n0  = _dst->dimension(0) % n0;
 
-    const float scale_x = scale_utils::calculate_resize_ratio(_src->dimension(1), _dst->dimension(1), _attributes.align_corners());
-    const float scale_y = scale_utils::calculate_resize_ratio(_src->dimension(2), _dst->dimension(2), _attributes.align_corners());
-
     CLBuildOptions build_opts;
 
     build_opts.add_option("-DN0=" + support::cpp11::to_string(n0));
     build_opts.add_option("-DM0=" + support::cpp11::to_string(m0));
     build_opts.add_option("-DPARTIAL_N0=" + support::cpp11::to_string(partial_n0));
-    build_opts.add_option("-DSCALE_X=" + float_to_string_with_full_precision(scale_x));
-    build_opts.add_option("-DSCALE_Y=" + float_to_string_with_full_precision(scale_y));
 
     return build_opts;
 }
diff --git a/src/gpu/cl/kernels/ClScaleKernel.cpp b/src/gpu/cl/kernels/ClScaleKernel.cpp
index 6f16adc657..7f24aa2eb6 100644
--- a/src/gpu/cl/kernels/ClScaleKernel.cpp
+++ b/src/gpu/cl/kernels/ClScaleKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2021 Arm Limited.
+ * Copyright (c) 2016-2022 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -102,7 +102,6 @@ void ClScaleKernel::configure(const CLCompileContext &compile_context, ITensorIn
     float scale_x = 0.f;
     float scale_y = 0.f;
     std::tie(scale_x, scale_y) = calculate_scale_factors(src, dst, _data_layout, info.align_corners);
-    const bool is_qasymm_bilinear = is_data_type_quantized_asymmetric(src->data_type()) && info.interpolation_policy == InterpolationPolicy::BILINEAR;
 
     // Area interpolation behaves as Nearest Neighbour in case of up-sampling
     auto interpolation_policy_to_use = info.interpolation_policy;
@@ -141,17 +140,6 @@ void ClScaleKernel::configure(const CLCompileContext &compile_context, ITensorIn
         build_opts.add_option_if(info.align_corners, "-DALIGN_CORNERS");
         build_opts.add_option_if(is_data_type_float(src->data_type()), "-DIS_FLOATING_POINT");
         build_opts.add_option_if_else(info.sampling_policy == SamplingPolicy::CENTER, "-DSAMPLING_POLICY_CENTER", "-DSAMPLING_POLICY_TOP_LEFT");
-        if(is_qasymm_bilinear)
-        {
-            const UniformQuantizationInfo qinfo = src->quantization_info().uniform();
-            build_opts.add_option("-DSCALE=" + support::cpp11::to_string(qinfo.scale));
-            build_opts.add_option("-DOFFSET=" + support::cpp11::to_string(qinfo.offset));
-        }
-        else
-        {
-            build_opts.add_option("-DSCALE=" + support::cpp11::to_string(1));
-            build_opts.add_option("-DOFFSET=" + support::cpp11::to_string(0));
-        }
     }
     else if(_data_layout == DataLayout::NCHW)
     {
@@ -169,6 +157,8 @@ void ClScaleKernel::configure(const CLCompileContext &compile_context, ITensorIn
         build_opts.add_option_if(info.border_mode == BorderMode::CONSTANT, "-DBORDER_MODE_CONSTANT");
         build_opts.add_option_if(info.align_corners, "-DALIGN_CORNERS");
         build_opts.add_option_if_else(info.sampling_policy == SamplingPolicy::CENTER, "-DSAMPLING_POLICY_CENTER", "-DSAMPLING_POLICY_TOP_LEFT");
+
+        const bool is_qasymm_bilinear = is_data_type_quantized_asymmetric(src->data_type()) && info.interpolation_policy == InterpolationPolicy::BILINEAR;
         if(is_qasymm_bilinear)
         {
             const UniformQuantizationInfo qinfo = src->quantization_info().uniform();
author	Gunes Bayir <gunes.bayir@arm.com>	2022-12-26 16:24:04 +0000
committer	Gunes Bayir <gunes.bayir@arm.com>	2022-12-29 17:27:33 +0000
commit	b7e8626717b2ef81b0d03284c8f6ffdbe9cd2245 (patch)
tree	8fb8197a4f69fb5024a5330e031d3d5c0b26bdad
parent	a5cb79f18685292bf5b63a0c484a58945320823d (diff)
download	ComputeLibrary-b7e8626717b2ef81b0d03284c8f6ffdbe9cd2245.tar.gz