From 471043616a869f0e696c8db4e1d0a62b45b4decf Mon Sep 17 00:00:00 2001
From: Pablo Tello <pablo.tello@arm.com>
Date: Wed, 27 Feb 2019 13:32:51 +0000
Subject: COMPMID-1882: Improve memory coalescence when reshaping the weights
 for CLDepthwiseConvolution

Change-Id: I97788d9e349f37fcd818d588d668e2d5e22fd568
Signed-off-by: giuros01 <giuseppe.rossini@arm.com>
Reviewed-on: https://review.mlplatform.org/c/818
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
---
 src/core/CL/CLKernelLibrary.cpp                    | 10 +--
 .../cl_kernels/depthwise_convolution_quantized.cl  | 96 ++++++++++++----------
 .../CLDepthwiseConvolutionLayer3x3NCHWKernel.cpp   |  5 +-
 .../CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp   | 16 +++-
 4 files changed, 76 insertions(+), 51 deletions(-)

diff --git a/src/core/CL/CLKernelLibrary.cpp b/src/core/CL/CLKernelLibrary.cpp
index a86d8d0414..7476e19011 100644
--- a/src/core/CL/CLKernelLibrary.cpp
+++ b/src/core/CL/CLKernelLibrary.cpp
@@ -217,11 +217,11 @@ const std::map<std::string, std::string> CLKernelLibrary::_kernel_program_map =
     { "depthwise_convolution_3x3_f16", "depthwise_convolution.cl" },
     { "depthwise_convolution_3x3_nhwc", "depthwise_convolution.cl" },
     { "depthwise_convolution_3x3_nhwc_stride1", "depthwise_convolution.cl" },
-    { "depthwise_convolution_3x3_quantized_nchw", "depthwise_convolution_quantized.cl" },
-    { "depthwise_convolution_3x3_quantized_nhwc", "depthwise_convolution_quantized.cl" },
-    { "depthwise_convolution_3x3_quantized_nhwc_stride1", "depthwise_convolution_quantized.cl" },
-    { "depthwise_convolution_3x3_quantized_dot8_nchw", "depthwise_convolution_quantized.cl" },
-    { "depthwise_convolution_3x3_quantized_dot8_nhwc_stride1", "depthwise_convolution_quantized.cl" },
+    { "dwc_3x3_native_qasymm8_nchw", "depthwise_convolution_quantized.cl" },
+    { "dwc_3x3_native_qasymm8_dot8_nchw", "depthwise_convolution_quantized.cl" },
+    { "dwc_3x3_reshaped_qasymm8_nhwc", "depthwise_convolution_quantized.cl" },
+    { "dwc_3x3_reshaped_qasymm8_stride1_nhwc", "depthwise_convolution_quantized.cl" },
+    { "dwc_3x3_reshaped_qasymm8_dot8_stride1_nhwc", "depthwise_convolution_quantized.cl" },
     { "depthwise_convolution_3x3_stridex1_stridey1_bifrost_f16", "depthwise_convolution.cl" },
     { "depthwise_convolution_3x3_stridex2_stridey2_bifrost_f16", "depthwise_convolution.cl" },
     { "depthwise_convolution_3x3_stridex1_stridey1_bifrost_f32", "depthwise_convolution.cl" },
diff --git a/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl b/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl
index 606af2edb1..503aa7e837 100644
--- a/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl
+++ b/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl
@@ -117,7 +117,7 @@
  * @param[in] biases_offset_first_element_in_bytes  (Optional) The offset of the first element in the biases vector
  */
 
-__kernel void depthwise_convolution_3x3_quantized_nchw(
+__kernel void dwc_3x3_native_qasymm8_nchw(
     TENSOR3D_DECLARATION(src),
     TENSOR3D_DECLARATION(dst),
     TENSOR3D_DECLARATION(weights)
@@ -254,7 +254,7 @@ __kernel void depthwise_convolution_3x3_quantized_nchw(
 
 #else // defined(REAL_MULTIPLIER)
 
-    values0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
+    values0                  = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
 
 #endif // defined(REAL_MULTIPLIER)
 
@@ -271,7 +271,7 @@ __kernel void depthwise_convolution_3x3_quantized_nchw(
 
 #else // defined(REAL_MULTIPLIER)
 
-    values1 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
+    values1                  = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
 
 #endif // defined(REAL_MULTIPLIER)
 
@@ -349,7 +349,7 @@ __kernel void depthwise_convolution_3x3_quantized_nchw(
  * @param[in] biases_offset_first_element_in_bytes  (Optional) The offset of the first element in the biases vector
  */
 
-__kernel void depthwise_convolution_3x3_quantized_dot8_nchw(
+__kernel void dwc_3x3_native_qasymm8_dot8_nchw(
     TENSOR3D_DECLARATION(src),
     TENSOR3D_DECLARATION(dst),
     TENSOR3D_DECLARATION(weights)
@@ -597,9 +597,10 @@ __kernel void depthwise_convolution_3x3_quantized_dot8_nchw(
 
 #endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
 
-#if defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y)
+#if defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y) && VEC_SIZE == 4
 /** This function computes the depthwise convolution quantized for NHWC data layout when the stride along the width or height is not 1.
  *
+ * @note This kernel assumes VEC_SIZE is 4.
  * @note The weights tensor is expected to be reshaped using @ref CLDepthwiseConvolutionLayerReshapeWeightsKernel.
  * @note The number of elements read per thread must be passed at compile time using -DVEC_SIZE (e.g. -DVEC_SIZE=2)
  * @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM2 (e.g. -DSRC_DIM_2=112)
@@ -640,7 +641,7 @@ __kernel void depthwise_convolution_3x3_quantized_dot8_nchw(
  * @param[in] biases_offset_first_element_in_bytes  (Optional) The offset of the first element in the biases vector
  * @param[in] max_offset                            Max offset for the input tensor
  */
-__kernel void depthwise_convolution_3x3_quantized_nhwc(
+__kernel void dwc_3x3_reshaped_qasymm8_nhwc(
     TENSOR4D_DECLARATION(src),
     TENSOR4D_DECLARATION(dst),
     IMAGE_DECLARATION(weights),
@@ -654,9 +655,9 @@ __kernel void depthwise_convolution_3x3_quantized_nhwc(
 #if defined(DST_DEPTH)
     int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y
     int b = get_global_id(2) / (int)DST_DEPTH; // batch
-#else // defined(DST_DEPTH)
-    int z = get_global_id(2); // spatial coordinate y
-#endif // defined(DST_DEPTH)
+#else                                          // defined(DST_DEPTH)
+    int      z               = get_global_id(2); // spatial coordinate y
+#endif                                         // defined(DST_DEPTH)
 
     __global uchar *weights_addr = weights_ptr + weights_offset_first_element_in_bytes + x * weights_stride_y;
 
@@ -682,15 +683,19 @@ __kernel void depthwise_convolution_3x3_quantized_nhwc(
     VEC_INT acc = 0, sum = 0;
 
     // Load weights
-    VEC_UCHAR w0 = VLOAD(VEC_SIZE)(0, weights_addr + 0);
-    VEC_UCHAR w1 = VLOAD(VEC_SIZE)(0, weights_addr + VEC_SIZE);
-    VEC_UCHAR w2 = VLOAD(VEC_SIZE)(0, weights_addr + 2 * VEC_SIZE);
-    VEC_UCHAR w3 = VLOAD(VEC_SIZE)(0, weights_addr + 3 * VEC_SIZE);
-    VEC_UCHAR w4 = VLOAD(VEC_SIZE)(0, weights_addr + 4 * VEC_SIZE);
-    VEC_UCHAR w5 = VLOAD(VEC_SIZE)(0, weights_addr + 5 * VEC_SIZE);
-    VEC_UCHAR w6 = VLOAD(VEC_SIZE)(0, weights_addr + 6 * VEC_SIZE);
-    VEC_UCHAR w7 = VLOAD(VEC_SIZE)(0, weights_addr + 7 * VEC_SIZE);
-    VEC_UCHAR w8 = VLOAD(VEC_SIZE)(0, weights_addr + 8 * VEC_SIZE);
+    uchar16 w0_tmp = VLOAD(16)(0, weights_addr);
+    uchar16 w1_tmp = VLOAD(16)(0, weights_addr + 16);
+    uchar4  w8     = VLOAD(4)(0, weights_addr + 2 * 16);
+
+    uchar4 w0 = w0_tmp.s0123;
+    uchar4 w1 = w0_tmp.s4567;
+    uchar4 w2 = w0_tmp.s89AB;
+    uchar4 w3 = w0_tmp.sCDEF;
+
+    uchar4 w4 = w1_tmp.s0123;
+    uchar4 w5 = w1_tmp.s4567;
+    uchar4 w6 = w1_tmp.s89AB;
+    uchar4 w7 = w1_tmp.sCDEF;
 
 #if INPUT_OFFSET != 0
     VEC_INT sum_we = CONVERT(w0, VEC_INT) + CONVERT(w1, VEC_INT) + CONVERT(w2, VEC_INT)
@@ -766,7 +771,7 @@ __kernel void depthwise_convolution_3x3_quantized_nhwc(
 
 #else  // defined(REAL_MULTIPLIER)
 
-    acc = asymm_mult_by_quant_multiplier_less_than_one(acc, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
+    acc                      = asymm_mult_by_quant_multiplier_less_than_one(acc, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
 #endif // defined(REAL_MULTIPLIER)
 
     acc += (VEC_INT)OUTPUT_OFFSET;
@@ -785,9 +790,10 @@ __kernel void depthwise_convolution_3x3_quantized_nhwc(
 }
 #endif // defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y)
 
-#if defined(NUM_ROWS_PROCESSED) && defined(NUM_PLANES_PROCESSED)
+#if defined(NUM_ROWS_PROCESSED) && defined(NUM_PLANES_PROCESSED) && VEC_SIZE == 4
 /** This function computes the depthwise convolution quantized for NHWC data layout when the stride along the width and height is 1.
  *
+ * @note This kernel assumes VEC_SIZE is 4.
  * @note The weights tensor is expected to be reshaped using @ref CLDepthwiseConvolutionLayerReshapeWeightsKernel.
  * @note The number of elements read per thread must be passed at compile time using -DVEC_SIZE (e.g. -DVEC_SIZE=2)
  * @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM2 (e.g. -DSRC_DIM_2=112)
@@ -829,7 +835,7 @@ __kernel void depthwise_convolution_3x3_quantized_nhwc(
  * @param[in] max_offset                            Max offset for the input tensor
  */
 
-__kernel void depthwise_convolution_3x3_quantized_nhwc_stride1(
+__kernel void dwc_3x3_reshaped_qasymm8_stride1_nhwc(
     TENSOR4D_DECLARATION(src),
     TENSOR4D_DECLARATION(dst),
     IMAGE_DECLARATION(weights),
@@ -843,9 +849,9 @@ __kernel void depthwise_convolution_3x3_quantized_nhwc_stride1(
 #if defined(DST_DEPTH)
     int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y
     int b = get_global_id(2) / (int)DST_DEPTH; // batch
-#else // defined(DST_DEPTH)
-    int z = get_global_id(2); // spatial coordinate y
-#endif // defined(DST_DEPTH)
+#else                                          // defined(DST_DEPTH)
+    int             z        = get_global_id(2); // spatial coordinate y
+#endif                                         // defined(DST_DEPTH)
 
     __global uchar *weights_addr = weights_ptr + weights_offset_first_element_in_bytes + x * weights_stride_y;
 
@@ -874,15 +880,19 @@ __kernel void depthwise_convolution_3x3_quantized_nhwc_stride1(
     VEC_INT acc3 = 0, sum3 = 0;
 
     // Load weights
-    VEC_UCHAR w0 = VLOAD(VEC_SIZE)(0, weights_addr + 0);
-    VEC_UCHAR w1 = VLOAD(VEC_SIZE)(0, weights_addr + VEC_SIZE);
-    VEC_UCHAR w2 = VLOAD(VEC_SIZE)(0, weights_addr + 2 * VEC_SIZE);
-    VEC_UCHAR w3 = VLOAD(VEC_SIZE)(0, weights_addr + 3 * VEC_SIZE);
-    VEC_UCHAR w4 = VLOAD(VEC_SIZE)(0, weights_addr + 4 * VEC_SIZE);
-    VEC_UCHAR w5 = VLOAD(VEC_SIZE)(0, weights_addr + 5 * VEC_SIZE);
-    VEC_UCHAR w6 = VLOAD(VEC_SIZE)(0, weights_addr + 6 * VEC_SIZE);
-    VEC_UCHAR w7 = VLOAD(VEC_SIZE)(0, weights_addr + 7 * VEC_SIZE);
-    VEC_UCHAR w8 = VLOAD(VEC_SIZE)(0, weights_addr + 8 * VEC_SIZE);
+    uchar16 w0_tmp = VLOAD(16)(0, weights_addr);
+    uchar16 w1_tmp = VLOAD(16)(0, weights_addr + 16);
+    uchar4  w8     = VLOAD(4)(0, weights_addr + 2 * 16);
+
+    uchar4 w0 = w0_tmp.s0123;
+    uchar4 w1 = w0_tmp.s4567;
+    uchar4 w2 = w0_tmp.s89AB;
+    uchar4 w3 = w0_tmp.sCDEF;
+
+    uchar4 w4 = w1_tmp.s0123;
+    uchar4 w5 = w1_tmp.s4567;
+    uchar4 w6 = w1_tmp.s89AB;
+    uchar4 w7 = w1_tmp.sCDEF;
 
 #if INPUT_OFFSET != 0
     VEC_INT sum_we = CONVERT(w0, VEC_INT) + CONVERT(w1, VEC_INT) + CONVERT(w2, VEC_INT)
@@ -1020,10 +1030,10 @@ __kernel void depthwise_convolution_3x3_quantized_nhwc_stride1(
 
 #else // defined(REAL_MULTIPLIER)
 
-    acc0 = asymm_mult_by_quant_multiplier_less_than_one(acc0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
-    acc1 = asymm_mult_by_quant_multiplier_less_than_one(acc1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
-    acc2 = asymm_mult_by_quant_multiplier_less_than_one(acc2, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
-    acc3 = asymm_mult_by_quant_multiplier_less_than_one(acc3, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
+    acc0                     = asymm_mult_by_quant_multiplier_less_than_one(acc0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
+    acc1                     = asymm_mult_by_quant_multiplier_less_than_one(acc1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
+    acc2                     = asymm_mult_by_quant_multiplier_less_than_one(acc2, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
+    acc3                     = asymm_mult_by_quant_multiplier_less_than_one(acc3, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
 
 #endif // defined(REAL_MULTIPLIER)
 
@@ -1110,7 +1120,7 @@ __kernel void depthwise_convolution_3x3_quantized_nhwc_stride1(
  * @param[in] biases_offset_first_element_in_bytes  (Optional) The offset of the first element in the biases vector
  * @param[in] max_offset                            The maximum allowed offset for the input tensor
  */
-__kernel void depthwise_convolution_3x3_quantized_dot8_nhwc_stride1(
+__kernel void dwc_3x3_reshaped_qasymm8_dot8_stride1_nhwc(
     TENSOR4D_DECLARATION(src),
     TENSOR4D_DECLARATION(dst),
     IMAGE_DECLARATION(weights),
@@ -1124,9 +1134,9 @@ __kernel void depthwise_convolution_3x3_quantized_dot8_nhwc_stride1(
 #if defined(DST_DEPTH)
     int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y
     int b = get_global_id(2) / (int)DST_DEPTH; // batch
-#else // defined(DST_DEPTH)
-    int z = get_global_id(2); // spatial coordinate y
-#endif // defined(DST_DEPTH)
+#else                                          // defined(DST_DEPTH)
+    int      z               = get_global_id(2); // spatial coordinate y
+#endif                                         // defined(DST_DEPTH)
 
     __global uchar *weights_addr = weights_ptr + weights_offset_first_element_in_bytes + x * weights_stride_y;
 
@@ -1255,8 +1265,8 @@ __kernel void depthwise_convolution_3x3_quantized_dot8_nhwc_stride1(
 
 #else // defined(REAL_MULTIPLIER)
 
-    acc0 = asymm_mult_by_quant_multiplier_less_than_one(acc0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
-    acc1 = asymm_mult_by_quant_multiplier_less_than_one(acc1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
+    acc0                     = asymm_mult_by_quant_multiplier_less_than_one(acc0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
+    acc1                     = asymm_mult_by_quant_multiplier_less_than_one(acc1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
 
 #endif // defined(REAL_MULTIPLIER)
     acc0 += (VEC_INT)OUTPUT_OFFSET;
diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.cpp
index e8efe6f0a9..770740d180 100644
--- a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.cpp
+++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.cpp
@@ -171,7 +171,10 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
     {
         const bool is_dot8_supported = dot8_supported(CLKernelLibrary::get().get_device());
 
-        kernel_name                       = is_qasymm ? (std::string("depthwise_convolution_3x3_quantized") + (is_dot8_supported ? "_dot8" : "") + "_nchw") : "depthwise_convolution_3x3";
+        kernel_name = is_qasymm ? "dwc_3x3_native_qasymm8" : "depthwise_convolution_3x3";
+        kernel_name += (is_qasymm && is_dot8_supported ? "_dot8" : "");
+        kernel_name += "_nchw";
+
         num_elems_written_per_iteration_x = 8 / data_size_from_type(input->data_type());
         num_elems_written_per_iteration_y = (is_qasymm && conv_stride_y == 1) ? 2 : 1;
         num_elems_read_per_iteration_x    = 3 + (num_elems_written_per_iteration_x - 1) * conv_stride_x;
diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp
index a82d3a6fae..431039c31f 100644
--- a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp
+++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp
@@ -252,9 +252,21 @@ void CLDepthwiseConvolutionLayer3x3NHWCKernel::configure(const ICLTensor *input,
     build_opts.add_option_if(_input->info()->tensor_shape().total_size_upper(3) > 1,
                              "-DDST_DEPTH=" + support::cpp11::to_string(static_cast<int>(std::ceil(_output->info()->dimension(2) / static_cast<float>(_num_planes_processed_per_iteration)))));
 
+    std::string kernel_name;
     // Create kernel
-    std::string kernel_name = std::string("depthwise_convolution_3x3") + (is_qasymm ? std::string("_quantized") + ((is_dot8_supported
-                                                                                                                    && is_stride_1) ? "_dot8" : "") : "") + "_nhwc" + (is_stride_1 ? "_stride1" : "");
+    if(is_qasymm)
+    {
+        kernel_name = std::string("dwc_3x3_reshaped_qasymm8");
+        kernel_name += (is_dot8_supported && is_stride_1 ? "_dot8" : "");
+        kernel_name += (is_stride_1 ? "_stride1" : "");
+        kernel_name += "_nhwc";
+    }
+    else
+    {
+        kernel_name = std::string("depthwise_convolution_3x3_nhwc");
+        kernel_name += (is_stride_1 ? "_stride1" : "");
+    }
+
     ICLKernel::configure_internal(win_config.second);
     _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
 
-- 
cgit v1.2.1