From fcd52fbc578a2f5e6a1df4c823284621cc55645a Mon Sep 17 00:00:00 2001
From: Anthony Barbier <anthony.barbier@arm.com>
Date: Tue, 28 Nov 2017 10:31:43 +0000
Subject: COMPMID-661: Vectorize im2col and add lws heuristics for convolution
 kernels #46

Change-Id: Idaab987384d6a12a114f609abd50446fd94536b2
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/110879
Tested-by: BSG Visual Compute Jenkins server to access repositories on http://mpd-gerrit.cambridge.arm.com <bsgcomp@arm.com>
Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
---
 src/core/CL/CLKernelLibrary.cpp                    |   1 +
 src/core/CL/cl_kernels/convolution_layer.cl        | 103 ++++++++++++++++++---
 src/core/CL/kernels/CLCol2ImKernel.cpp             |  15 +++
 src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp |  14 ++-
 src/core/CL/kernels/CLIm2ColKernel.cpp             |  54 ++++++++++-
 5 files changed, 172 insertions(+), 15 deletions(-)

(limited to 'src/core/CL')

diff --git a/src/core/CL/CLKernelLibrary.cpp b/src/core/CL/CLKernelLibrary.cpp
index a4b88b8eb2..de75518a05 100644
--- a/src/core/CL/CLKernelLibrary.cpp
+++ b/src/core/CL/CLKernelLibrary.cpp
@@ -251,6 +251,7 @@ const std::map<std::string, std::string> CLKernelLibrary::_kernel_program_map =
     { "hog_orientation_binning", "hog.cl" },
     { "hysteresis", "canny.cl" },
     { "im2col_generic", "convolution_layer.cl" },
+    { "im2col_generic_padx0_pady0", "convolution_layer.cl" },
     { "im2col_kernel3x3_padx0_pady0", "convolution_layer.cl" },
     { "im2col_reduced", "convolution_layer.cl" },
     { "init_level", "optical_flow_pyramid_lk.cl" },
diff --git a/src/core/CL/cl_kernels/convolution_layer.cl b/src/core/CL/cl_kernels/convolution_layer.cl
index c7e3e644f4..ce0849bf7a 100644
--- a/src/core/CL/cl_kernels/convolution_layer.cl
+++ b/src/core/CL/cl_kernels/convolution_layer.cl
@@ -117,27 +117,25 @@ __kernel void reshape_to_columns(
  * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
  * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
  * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in]  filter_depth                      The depth of the used filter
  * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes).
  * @param[in]  dst_stride_w                      Stride of the destination tensor in W dimension (in bytes).
  */
 __kernel void im2col_generic(
     TENSOR3D_DECLARATION(src),
     IMAGE_DECLARATION(dst),
-    uint filter_depth,
     uint src_stride_w,
     uint dst_stride_w)
 {
     const int xc    = get_global_id(0);                // x coordinate in the convolved tensor
     const int yc    = get_global_id(1);                // y coordinate in the convolved tensor
-    const int ch    = get_global_id(2) % filter_depth; // input feature map
-    const int batch = get_global_id(2) / filter_depth; // the batch
+    const int ch    = get_global_id(2) % KERNEL_DEPTH; // input feature map
+    const int batch = get_global_id(2) / KERNEL_DEPTH; // batch size
 
-    // Calculate input indeces
+    // Calculate input indices
     const int xi = xc * STRIDE_X - PAD_LEFT;
     const int yi = yc * STRIDE_Y - PAD_TOP;
 
-    // Calculate output indeces
+    // Calculate output indices
     const int xo = ch * KERNEL_WIDTH * KERNEL_HEIGHT;
     const int yo = xc + yc * CONVOLVED_WIDTH; // Index of the convolution
 
@@ -199,27 +197,25 @@ __kernel void im2col_generic(
  * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
  * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
  * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in]  filter_depth                      The depth of the used filter
  * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes).
  * @param[in]  dst_stride_w                      Stride of the destination tensor in W dimension (in bytes).
  */
 __kernel void im2col_kernel3x3_padx0_pady0(
     TENSOR3D_DECLARATION(src),
     IMAGE_DECLARATION(dst),
-    uint filter_depth,
     uint src_stride_w,
     uint dst_stride_w)
 {
     const int xc    = get_global_id(0);                // x coordinate in the convolved tensor
     const int yc    = get_global_id(1);                // y coordinate in the convolved tensor
-    const int ch    = get_global_id(2) % filter_depth; // input feature map
-    const int batch = get_global_id(2) / filter_depth; // the batch
+    const int ch    = get_global_id(2) % KERNEL_DEPTH; // input feature map
+    const int batch = get_global_id(2) / KERNEL_DEPTH; // batch size
 
-    // Calculate input indeces
+    // Calculate input indices
     const int xi = xc * STRIDE_X;
     const int yi = yc * STRIDE_Y;
 
-    // Calculate output indeces
+    // Calculate output indices
     const int xo = ch * KERNEL_WIDTH * KERNEL_HEIGHT;
     const int yo = xc + yc * CONVOLVED_WIDTH; // Index of the convolution
 
@@ -336,3 +332,86 @@ __kernel void im2col_reduced(
     }
 #endif // HAS_BIAS
 }
+
+#if defined(CONVOLVED_WIDTH) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(PAD_RIGHT) && defined(PAD_BOTTOM) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(KERNEL_DEPTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(VECTOR_SIZE) && defined(WIDTH_MOD_VECTOR_SIZE)
+/** This kernel reshapes the input tensor to a tensor used to perform convolution using GEMM when
+ * the kernel width is greater than 1 (except when the kernel size is 3x3) and pad_x == pad_y == 0.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float.
+ * @note The vector size must be passed at compile time using -DVECTOR_SIZE e.g. -DVECTOR_SIZE=4.
+ * @note The width modulo vector size must be passed at compile time using -DWIDTH_MOD_VECTOR_SIZE e.g. -DWIDTH_MOD_VECTOR_SIZE=3.
+ * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: QS8/QS16/F16/F32
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes).
+ * @param[in]  dst_stride_w                      Stride of the destination tensor in W dimension (in bytes).
+ */
+__kernel void im2col_generic_padx0_pady0(
+    TENSOR3D_DECLARATION(src),
+    IMAGE_DECLARATION(dst),
+    uint src_stride_w,
+    uint dst_stride_w)
+{
+    const int xc    = get_global_id(0);                // x coordinate in the convolved tensor
+    const int yc    = get_global_id(1);                // y coordinate in the convolved tensor
+    const int ch    = get_global_id(2) % KERNEL_DEPTH; // input feature map
+    const int batch = get_global_id(2) / KERNEL_DEPTH; // batch size
+
+    // Calculate input indices
+    const int xi = xc * STRIDE_X;
+    const int yi = yc * STRIDE_Y;
+    // Calculate output indices
+    const int xo                   = ch * KERNEL_WIDTH * KERNEL_HEIGHT;
+    const int yo                   = xc + yc * CONVOLVED_WIDTH; // Index of the convolution
+    __global uchar *input_ptr      = src_ptr + src_offset_first_element_in_bytes + ch * src_stride_z + batch * src_stride_w;
+    __global DATA_TYPE *output_ptr = ((__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + yo * dst_stride_y + batch * dst_stride_w)) + xo;
+    // Linearize convolution elements
+    for(int y = yi, y_e = yi + KERNEL_HEIGHT; y < y_e; ++y)
+    {
+        int last_x = 0;
+        for(int x = xi, x_e = xi + KERNEL_WIDTH; x + VECTOR_SIZE <= x_e; x += VECTOR_SIZE, output_ptr += VECTOR_SIZE)
+        {
+            VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
+            row = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + x * src_stride_x + y * src_stride_y));
+            VSTORE(VECTOR_SIZE)
+            (row, 0, output_ptr);
+            last_x = x;
+        }
+        // Copy the remainder of the row by doing VLOAD(WIDTH_MOD_VECTOR_SIZE) and VSTORE(WIDTH_MOD_VECTOR_SIZE).
+        // Note that x and output_ptr have already been incremented by VECTOR_SIZE by the loop just before exit.
+#if WIDTH_MOD_VECTOR_SIZE == 1
+        *output_ptr = *((__global DATA_TYPE *)(input_ptr + (last_x + VECTOR_SIZE) * src_stride_x + y * src_stride_y));
+#elif WIDTH_MOD_VECTOR_SIZE > 1
+        VEC_DATA_TYPE(DATA_TYPE, WIDTH_MOD_VECTOR_SIZE)
+        row = VLOAD(WIDTH_MOD_VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + (last_x + VECTOR_SIZE) * src_stride_x + y * src_stride_y));
+        VSTORE(WIDTH_MOD_VECTOR_SIZE)
+        (row, 0, output_ptr);
+#endif /* WIDTH_MOD_VECTOR_SIZE */
+        output_ptr += WIDTH_MOD_VECTOR_SIZE;
+    } /* End of loop over KERNEL_HEIGHT */
+
+#ifdef HAS_BIAS
+    if(ch == (KERNEL_DEPTH - 1))
+    {
+#ifdef FIXED_POINT_POSITION
+        *output_ptr = (DATA_TYPE)(1 << FIXED_POINT_POSITION);
+#else  // FIXED_POINT_POSITION
+        *output_ptr       = 1.0f;
+#endif // FIXED_POINT_POSITION
+    }
+#endif // HAS_BIAS
+}
+#endif //defined(CONVOLVED_WIDTH) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(PAD_RIGHT) && defined(PAD_BOTTOM) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(KERNEL_DEPTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(VECTOR_SIZE) && defined(WIDTH_MOD_VECTOR_SIZE)
diff --git a/src/core/CL/kernels/CLCol2ImKernel.cpp b/src/core/CL/kernels/CLCol2ImKernel.cpp
index 31cc6448c9..f2886c569a 100644
--- a/src/core/CL/kernels/CLCol2ImKernel.cpp
+++ b/src/core/CL/kernels/CLCol2ImKernel.cpp
@@ -72,6 +72,21 @@ void CLCol2ImKernel::configure(const ICLTensor *input, ICLTensor *output, std::p
 
     _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("col2im", build_opts));
 
+    // Configure the local work size for Bifrost with a value obtained
+    // via exhaustive autotuning over 30 representative tensor shapes.
+    const GPUTarget gpu_target = get_arch_from_target(get_target());
+    if(gpu_target == GPUTarget::BIFROST)
+    {
+        if((_convolved_dims.first == 7) || (_convolved_dims.first == 14))
+        {
+            _lws_hint = cl::NDRange(1, 7, 1);
+        }
+        else
+        {
+            _lws_hint = cl::NDRange(1, 8, 1);
+        }
+    }
+
     // Configure window
     Window win = calculate_max_window(*input->info(), Steps());
 
diff --git a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
index d39dcdb336..16706dd748 100644
--- a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
@@ -68,7 +68,19 @@ void CLGEMMMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTen
     GPUTarget arch_target = get_arch_from_target(get_target());
 
     // Configure LWS hint
-    _lws_hint = (output->info()->dimension(1) == 196) ? cl::NDRange(1, 7) : cl::NDRange(8, 8);
+    if(arch_target == GPUTarget::BIFROST && input1->info()->dimension(1) == 24)
+    {
+        // LWS optimized for the 11x11 AlexNet convolution on Bifrost.
+        _lws_hint = cl::NDRange(2, 2);
+    }
+    else if(output->info()->dimension(1) == 196)
+    {
+        _lws_hint = cl::NDRange(1, 7);
+    }
+    else
+    {
+        _lws_hint = cl::NDRange(8, 8);
+    }
 
     // Create build options
     CLBuildOptions build_opts;
diff --git a/src/core/CL/kernels/CLIm2ColKernel.cpp b/src/core/CL/kernels/CLIm2ColKernel.cpp
index 07372c7b91..f7cf9a3cb4 100644
--- a/src/core/CL/kernels/CLIm2ColKernel.cpp
+++ b/src/core/CL/kernels/CLIm2ColKernel.cpp
@@ -53,7 +53,8 @@ void CLIm2ColKernel::configure(const ICLTensor *input, ICLTensor *output, const
     _input  = input;
     _output = output;
 
-    const DataType data_type = input->info()->data_type();
+    const DataType  data_type  = input->info()->data_type();
+    const GPUTarget gpu_target = get_arch_from_target(get_target());
 
     // Create kernel
     CLBuildOptions build_opts;
@@ -98,6 +99,56 @@ void CLIm2ColKernel::configure(const ICLTensor *input, ICLTensor *output, const
         if(kernel_dims.width == 3 && kernel_dims.height == 3 && !conv_info.has_padding())
         {
             kernel_name = "im2col_kernel3x3_padx0_pady0";
+
+            // Local work size optimized for the 3x3 MobileNets convolution on Bifrost.
+            if(gpu_target == GPUTarget::BIFROST && input->info()->dimension(0) == 224)
+            {
+                _lws_hint = cl::NDRange(2, 3, 3);
+            }
+        }
+        else if(kernel_dims.width > 1 && !conv_info.has_padding())
+        {
+            kernel_name = "im2col_generic_padx0_pady0";
+
+            // Optimized im2col is performed using one or more vector operations with the specified vector size
+            // and a remainder. For example, for 5x5 convolutions, im2col is performed using vectors of size 4
+            // and scalars; for 7x7 convolutions, using vectors of size 4 and vectors of size 3.
+            // Using the vector size of 4 is always safe since OpenCL supports vectors of size 2 and 3.
+            // Using the vector size of 8, however, may be faster.
+            size_t vector_size = 4;
+            // For 2x2 convolutions, use vectors of size 2. (For 3x3 convolutions, im2col_kernel3x3_padx0_pady0
+            // is used instead.)
+            if(kernel_dims.width < vector_size)
+            {
+                vector_size = kernel_dims.width;
+            }
+            // Local work size and vector size optimized for the 11x11 AlexNet convolution on Bifrost.
+            if(gpu_target == GPUTarget::BIFROST && kernel_dims.width == 11)
+            {
+                _lws_hint   = cl::NDRange(1, 1, 1);
+                vector_size = 8;
+            }
+            const size_t width_mod_vector_size = kernel_dims.width % vector_size;
+            build_opts.add_option("-DVECTOR_SIZE=" + support::cpp11::to_string(vector_size));
+            build_opts.add_option("-DWIDTH_MOD_VECTOR_SIZE=" + support::cpp11::to_string(width_mod_vector_size));
+        }
+        else
+        {
+            if(gpu_target == GPUTarget::BIFROST)
+            {
+                const size_t input_channels = input->info()->dimension(2);
+                if((input_channels & (input_channels - 1)) == 0)
+                {
+                    // input_channels is a power of two
+                    _lws_hint = cl::NDRange(1, 1, 4);
+                }
+                else if(input_channels < 192 && (input_channels % 4) == 0)
+                {
+                    // input_channels is less than 192 and is a multiple of 4
+                    _lws_hint = cl::NDRange(1, 1, 2);
+                }
+                // otherwise the default is optimal
+            }
         }
         _run_func = &CLIm2ColKernel::run_generic;
     }
@@ -173,7 +224,6 @@ void CLIm2ColKernel::run_generic(const Window &window, cl::CommandQueue &queue)
         unsigned int idx = 0;
         add_3D_tensor_argument(idx, _input, slice_in);
         add_2D_tensor_argument(idx, _output, slice_out);
-        _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input->info()->dimension(2)));
         _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input->info()->strides_in_bytes()[3]));
         _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_output->info()->strides_in_bytes()[3]));
         enqueue(queue, *this, slice, _lws_hint);
-- 
cgit v1.2.1