From b9d38ee6378f3035f8dbad442223d3d9e2f3dc4f Mon Sep 17 00:00:00 2001
From: Frank Lei
Date: Tue, 5 Dec 2017 10:43:33 +0800
Subject: APPBROWSER-312 Fully connected performance optimization

Change-Id: Ie93fd630ebbad7b6ca8812cb5044b3f1908b45fd
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/111830
Reviewed-by: Stephen Li
Tested-by: BSG Visual Compute Jenkins server to access repositories on http://mpd-gerrit.cambridge.arm.com
Reviewed-by: Anthony Barbier
---
 .../kernels/GCDirectConvolutionLayerKernel.cpp     | 19 ++++++++-
 .../kernels/GCGEMMMatrixAccumulateBiasesKernel.cpp | 46 +++++++++++++++++-----
 .../kernels/GCGEMMMatrixMultiplyKernel.cpp         | 30 +++++++++++++-
 src/core/GLES_COMPUTE/kernels/GCIm2ColKernel.cpp   | 33 +++++++++++++++-
 .../GLES_COMPUTE/kernels/GCTransposeKernel.cpp     | 32 ++++++++++++---
 5 files changed, 142 insertions(+), 18 deletions(-)

(limited to 'src/core/GLES_COMPUTE/kernels')

diff --git a/src/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.cpp
index b032bc5668..a7d721d035 100644
--- a/src/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.cpp
@@ -53,7 +53,6 @@ template <unsigned int kernel_size>
 void GCDirectConvolutionLayerKernel<kernel_size>::configure(const IGCTensor *input, const IGCTensor *weights, const IGCTensor *bias, IGCTensor *output, const PadStrideInfo &conv_info)
 {
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output);
     ARM_COMPUTE_ERROR_ON(weights->info()->dimension(2) != input->info()->dimension(2));
     ARM_COMPUTE_ERROR_ON(weights->info()->dimension(0) != weights->info()->dimension(1));
     ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() > 4);
@@ -68,6 +67,24 @@ void GCDirectConvolutionLayerKernel<kernel_size>::configure(const IGCTensor *inp
         ARM_COMPUTE_ERROR_ON(bias->info()->num_dimensions() > 1);
     }
 
+    // Get convolved dimensions
+    unsigned int owidth  = 0;
+    unsigned int oheight = 0;
+    std::tie(owidth, oheight) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), kernel_size, kernel_size, conv_info);
+
+    TensorShape output_shape = input->info()->tensor_shape();
+    output_shape.set(0, owidth);
+    output_shape.set(1, oheight);
+    output_shape.set(2, weights->info()->dimension(3));
+
+    // Output auto initialization if not yet initialized
+    auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
+
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+
     _conv_stride_x = std::get<0>(conv_info.stride());
     _conv_stride_y = std::get<1>(conv_info.stride());
     _conv_pad_x    = std::get<0>(conv_info.pad());
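Note: the hunk above computes the convolved output shape up front and auto-initializes the output tensor, so the type/shape assertions can run against a fully configured output. A minimal sketch of the underlying output-size arithmetic (standalone C++; scaled_dims is a hypothetical stand-in for arm_compute::scaled_dimensions, assuming a square kernel, symmetric padding and floor rounding):

    #include <cstdio>
    #include <utility>

    // Hypothetical stand-in for arm_compute::scaled_dimensions():
    // output extent of a convolution with floor rounding.
    static std::pair<unsigned int, unsigned int> scaled_dims(unsigned int w, unsigned int h,
                                                             unsigned int kernel,
                                                             unsigned int stride, unsigned int pad)
    {
        const unsigned int ow = (w + 2 * pad - kernel) / stride + 1;
        const unsigned int oh = (h + 2 * pad - kernel) / stride + 1;
        return std::make_pair(ow, oh);
    }

    int main()
    {
        // e.g. a 224x224 input with a 3x3 kernel, stride 1, pad 1 keeps its spatial size
        const auto out = scaled_dims(224U, 224U, 3U, 1U, 1U);
        std::printf("%ux%u\n", out.first, out.second); // prints 224x224
        return 0;
    }

The third output dimension is then taken from weights dimension 3, i.e. one output plane per kernel, matching the output_shape.set(2, ...) call in the hunk.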
diff --git a/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixAccumulateBiasesKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixAccumulateBiasesKernel.cpp
index 8625d371e5..944585daff 100644
--- a/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixAccumulateBiasesKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixAccumulateBiasesKernel.cpp
@@ -37,7 +37,7 @@ using namespace arm_compute;
 
 GCGEMMMatrixAccumulateBiasesKernel::GCGEMMMatrixAccumulateBiasesKernel()
-    : _accum(nullptr), _biases(nullptr)
+    : _accum(nullptr), _biases(nullptr), _lws(gles::NDRange(1U, 1U, 1U))
 {
 }
@@ -51,14 +51,23 @@ void GCGEMMMatrixAccumulateBiasesKernel::configure(IGCTensor *accum, const IGCTe
     _accum = accum;
 
     std::set<std::string> build_opts;
-    build_opts.emplace("#define LOCAL_SIZE_X " + support::cpp11::to_string(1));
-    build_opts.emplace("#define LOCAL_SIZE_Y " + support::cpp11::to_string(1));
-    build_opts.emplace("#define LOCAL_SIZE_Z " + support::cpp11::to_string(1));
+    build_opts.emplace("#define LOCAL_SIZE_X " + support::cpp11::to_string(_lws[0]));
+    build_opts.emplace("#define LOCAL_SIZE_Y " + support::cpp11::to_string(_lws[1]));
+    build_opts.emplace("#define LOCAL_SIZE_Z " + support::cpp11::to_string(_lws[2]));
 
     // Create kernel
     build_opts.emplace("#define GEMM_ACCUMULATE_BIASES");
+
+#define ACCUM_PROCESS_4X
+
+#if defined(ACCUM_PROCESS_4X)
+    build_opts.emplace("#define ACCUM_PROCESS_4X");
+#elif defined(ACCUM_PROCESS_8X) /* ACCUM_PROCESS_4X */
+    build_opts.emplace("#define ACCUM_PROCESS_8X");
+#endif /* ACCUM_PROCESS_4X */
     std::string dt_name = (accum->info()->data_type() == DataType::F32) ? "DATA_TYPE_FP32" : "DATA_TYPE_FP16";
     build_opts.emplace(("#define " + dt_name));
+
     _kernel = GCKernelLibrary::get().create_kernel("gemm_accumulate_biases", build_opts);
 
     // Configure kernel window
@@ -70,13 +79,21 @@ void GCGEMMMatrixAccumulateBiasesKernel::configure(IGCTensor *accum, const IGCTe
     }
     else if(_accum->info()->data_type() == DataType::F16)
     {
+#if defined(ACCUM_PROCESS_4X)
         num_elems_processed_per_iteration = 4;
+#elif defined(ACCUM_PROCESS_8X) /* ACCUM_PROCESS_4X */
+        num_elems_processed_per_iteration = 8;
+#endif /* ACCUM_PROCESS_4X */
     }
 
-    Window win = calculate_max_window(*_accum->info(), Steps(num_elems_processed_per_iteration));
+    const int  accum_width         = accum->info()->dimension(0);
+    const int  accum_padding_right = ceil_to_multiple(accum_width, num_elems_processed_per_iteration * _lws[0]) - accum_width;
+    BorderSize border              = BorderSize(0, accum_padding_right, 0, 0);
+
+    Window win = calculate_max_enlarged_window(*_accum->info(), Steps(num_elems_processed_per_iteration), border);
 
-    AccessWindowStatic     biases_access(biases->info(), 0, 0, ceil_to_multiple(biases->info()->dimension(0), num_elems_processed_per_iteration), biases->info()->dimension(1));
-    AccessWindowHorizontal accum_access(_accum->info(), 0, num_elems_processed_per_iteration);
+    AccessWindowStatic biases_access(biases->info(), 0, 0, ceil_to_multiple(biases->info()->dimension(0), num_elems_processed_per_iteration * _lws[0]), biases->info()->dimension(1));
+    AccessWindowStatic accum_access(_accum->info(), 0, 0, accum_width + accum_padding_right, _accum->info()->dimension(1));
 
     update_window_and_padding(win, biases_access, accum_access);
 
@@ -107,13 +124,22 @@ void GCGEMMMatrixAccumulateBiasesKernel::run(const Window &window)
         }
         else if(_accum->info()->data_type() == DataType::F16)
         {
-            add_2D_tensor_argument(idx, _accum, BufferParam(1, 3), accum_slice);
-            add_1D_tensor_argument(idx, _biases, BufferParam(2, 3), biases_slice);
+#if defined(ACCUM_PROCESS_4X)
+            BufferParam param = { 1, 3 };
+            add_2D_tensor_argument(idx, _accum, param, accum_slice);
+            param.binding_point = 2;
+            add_1D_tensor_argument(idx, _biases, param, biases_slice);
+#elif defined(ACCUM_PROCESS_8X) /* ACCUM_PROCESS_4X */
+            BufferParam param = { 1, 4 };
+            add_2D_tensor_argument(idx, _accum, param, accum_slice);
+            param.binding_point = 2;
+            add_1D_tensor_argument(idx, _biases, param, biases_slice);
+#endif /* ACCUM_PROCESS_4X */
        }
 
         _kernel.update_shader_params();
-        enqueue(*this, accum_slice);
+        enqueue(*this, accum_slice, _lws);
     }
     while(window.slide_window_slice_2D(accum_slice));
 }
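Note: the reworked window logic above right-pads the accumulator so each shader invocation processes a whole vec4 (ACCUM_PROCESS_4X) or vec8 (ACCUM_PROCESS_8X) of values, and enqueue() now receives the kernel's local workgroup size _lws. A small sketch of the padding arithmetic, with ceil_to_multiple re-implemented here on the assumption that it rounds up to the next multiple:

    #include <cstdio>

    // Assumed behaviour of arm_compute::ceil_to_multiple():
    // smallest multiple of m that is >= v.
    static unsigned int ceil_to_multiple(unsigned int v, unsigned int m)
    {
        return ((v + m - 1) / m) * m;
    }

    int main()
    {
        const unsigned int accum_width = 1001U; // hypothetical FC output width
        const unsigned int vec_width   = 4U;    // ACCUM_PROCESS_4X
        const unsigned int lws_x       = 1U;    // _lws[0]

        // Right padding so the enlarged window covers whole vec4 loads per invocation
        const unsigned int padding = ceil_to_multiple(accum_width, vec_width * lws_x) - accum_width;
        std::printf("padding right = %u\n", padding); // 1004 - 1001 = 3
        return 0;
    }

Declaring the padded region through the static access windows keeps the vectorized loads and stores in bounds without a scalar tail loop.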
diff --git a/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixMultiplyKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixMultiplyKernel.cpp
index a75ab6b609..8179525470 100644
--- a/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixMultiplyKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixMultiplyKernel.cpp
@@ -118,9 +118,23 @@ void GCGEMMMatrixMultiplyKernel::configure(const IGCTensor *input0, const IGCTen
     switch(input0->info()->data_type())
     {
         case DataType::F16:
+            build_opts.emplace("#define DATA_TYPE_FP16");
+
+#define MM_PROCESS_4X_OPTIMIZED
+
+#if defined(MM_PROCESS_4X)
+            num_elems_processed_per_iteration_x = 4;
+            num_elems_processed_per_iteration_y = std::min(static_cast<int>(output->info()->dimension(1)), 4);
+            build_opts.emplace("#define MM_PROCESS_4X");
+#elif defined(MM_PROCESS_4X_OPTIMIZED) /* MM_PROCESS_4X */
             num_elems_processed_per_iteration_x = 4;
+            num_elems_processed_per_iteration_y = std::min(static_cast<int>(output->info()->dimension(1)), 4);
+            build_opts.emplace("#define MM_PROCESS_4X_OPTIMIZED");
+#elif defined(MM_PROCESS_8X) /* MM_PROCESS_4X */
+            num_elems_processed_per_iteration_x = 8;
             num_elems_processed_per_iteration_y = 1;
-            build_opts.emplace("#define DATA_TYPE_FP16");
+            build_opts.emplace("#define MM_PROCESS_8X");
+#endif /* MM_PROCESS_4X */
             break;
 
         case DataType::F32:
@@ -143,8 +157,12 @@ void GCGEMMMatrixMultiplyKernel::configure(const IGCTensor *input0, const IGCTen
 
     win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
 
+#if defined(MM_PROCESS_4X_OPTIMIZED)
+    AccessWindowStatic input0_access(input0->info(), 0, 0, ceil_to_multiple(input0->info()->dimension(0), 8), ceil_to_multiple(input0->info()->dimension(1), num_elems_processed_per_iteration_y));
+#else /* MM_PROCESS_4X_OPTIMIZED */
     AccessWindowStatic input0_access(input0->info(), 0, 0, ceil_to_multiple(input0->info()->dimension(0), num_elems_processed_per_iteration_x), ceil_to_multiple(input0->info()->dimension(1), num_elems_processed_per_iteration_y));
+#endif /* MM_PROCESS_4X_OPTIMIZED */
     AccessWindowStatic input1_access(input1->info(), 0, 0, ceil_to_multiple(input1->info()->dimension(0), num_elems_processed_per_iteration_x), input1->info()->dimension(1));
     AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
@@ -185,9 +203,19 @@ void GCGEMMMatrixMultiplyKernel::run(const Window &window)
         switch(_input0->info()->data_type())
         {
             case DataType::F16:
+#if defined(MM_PROCESS_4X)
                 add_2D_tensor_argument(idx, _input0, BufferParam(1, 2), slice);
                 add_2D_tensor_argument(idx, _input1, BufferParam(2, 3), slice_b);
                 add_2D_tensor_argument(idx, _output, BufferParam(3, 3), slice);
+#elif defined(MM_PROCESS_4X_OPTIMIZED) /* MM_PROCESS_4X */
+                add_2D_tensor_argument(idx, _input0, BufferParam(1, 4), slice);
+                add_2D_tensor_argument(idx, _input1, BufferParam(2, 3), slice_b);
+                add_2D_tensor_argument(idx, _output, BufferParam(3, 3), slice);
+#elif defined(MM_PROCESS_8X) /* MM_PROCESS_4X */
+                add_2D_tensor_argument(idx, _input0, BufferParam(1, 4), slice);
+                add_2D_tensor_argument(idx, _input1, BufferParam(2, 4), slice_b);
+                add_2D_tensor_argument(idx, _output, BufferParam(3, 4), slice);
+#endif /* MM_PROCESS_4X */
                 break;
 
             case DataType::F32:
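Note: in the MM_PROCESS_4X_OPTIMIZED path each invocation computes a block of 4 output columns by up to 4 rows, but the y-step is clamped with std::min because in a fully connected layer the GEMM output often has height 1 (a single input vector times the weight matrix). A toy illustration of that step selection, using hypothetical output heights:

    #include <algorithm>
    #include <cstdio>
    #include <initializer_list>

    int main()
    {
        // Hypothetical output heights: a batched GEMM vs. a fully connected vector
        for(const int output_height : { 16, 1 })
        {
            const int step_x = 4;                          // vec4 of columns per invocation
            const int step_y = std::min(output_height, 4); // never step past the tensor
            std::printf("height %2d -> window steps %dx%d\n", output_height, step_x, step_y);
        }
        return 0;
    }

The input0 access window is also widened to a multiple of 8 in the optimized path, presumably so the shader can fetch the LHS rows in packed 8-element chunks.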
diff --git a/src/core/GLES_COMPUTE/kernels/GCIm2ColKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCIm2ColKernel.cpp
index 97c4dc48a1..e849891c7c 100644
--- a/src/core/GLES_COMPUTE/kernels/GCIm2ColKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCIm2ColKernel.cpp
@@ -107,7 +107,38 @@ void GCIm2ColKernel::configure(const IGCTensor *input, IGCTensor *output, std::p
     else
     {
         build_opts.insert("#define IM2COL_REDUCED");
-        _num_elems_processed_per_iteration = 4 / input->info()->element_size();
+
+        if(input->info()->data_type() == DataType::F32)
+        {
+            _num_elems_processed_per_iteration = 4 / input->info()->element_size();
+        }
+        else if(input->info()->data_type() == DataType::F16)
+        {
+            int input_width  = input->info()->dimension(0);
+            int input_height = input->info()->dimension(1);
+
+            build_opts.insert("#define IMAGE_SIZE " + support::cpp11::to_string(input_width * input_height));
+            if(input_width % 8 == 0)
+            {
+                _num_elems_processed_per_iteration = 8;
+                build_opts.insert("#define IM2COL_REDUCED_8X");
+            }
+            else if(input_width % 4 == 0)
+            {
+                _num_elems_processed_per_iteration = 4;
+                build_opts.insert("#define IM2COL_REDUCED_4X");
+            }
+            else if(input_width % 2 == 0)
+            {
+                _num_elems_processed_per_iteration = 2;
+                build_opts.insert("#define IM2COL_REDUCED_2X");
+            }
+            else
+            {
+                _num_elems_processed_per_iteration = 2;
+                build_opts.insert("#define IM2COL_REDUCED_GENERIC");
+            }
+        }
 
         // Create kernel
         _kernel = static_cast<GCKernel>(GCKernelLibrary::get().create_kernel("im2col_reduced", build_opts));
diff --git a/src/core/GLES_COMPUTE/kernels/GCTransposeKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCTransposeKernel.cpp
index 5bd34c2c85..acb998840b 100644
--- a/src/core/GLES_COMPUTE/kernels/GCTransposeKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCTransposeKernel.cpp
@@ -64,12 +64,25 @@ void GCTransposeKernel::configure(const IGCTensor *input, IGCTensor *output)
     build_opts.emplace("#define LOCAL_SIZE_Y " + support::cpp11::to_string(1));
     build_opts.emplace("#define LOCAL_SIZE_Z " + support::cpp11::to_string(1));
 
+    // Configure kernel window
+    unsigned int num_elems_processed_per_iteration = 4;
+
+    if(input->info()->data_type() == DataType::F16)
+    {
+#define TRANSPOSE_8X8
+
+#if defined(TRANSPOSE_4X4)
+        build_opts.emplace(("#define TRANSPOSE_4X4"));
+        num_elems_processed_per_iteration = 4;
+#elif defined(TRANSPOSE_8X8) /* TRANSPOSE_4X4 */
+        build_opts.emplace(("#define TRANSPOSE_8X8"));
+        num_elems_processed_per_iteration = 8;
+#endif /* TRANSPOSE_4X4 */
+    }
+
     // Create kernel
     _kernel = static_cast<GCKernel>(GCKernelLibrary::get().create_kernel("transpose", build_opts));
 
-    // Configure kernel window
-    const unsigned int num_elems_processed_per_iteration = 4;
-
     Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration, num_elems_processed_per_iteration));
 
     AccessWindowRectangle input_access(input->info(), 0, 0, num_elems_processed_per_iteration, num_elems_processed_per_iteration);
@@ -100,8 +113,17 @@ void GCTransposeKernel::run(const Window &window)
         }
         else if(_input->info()->data_type() == DataType::F16)
         {
-            add_2D_tensor_argument(idx, _input, BufferParam(1, 3), slice);
-            add_2D_tensor_argument(idx, _output, BufferParam(2, 3), slice);
+#if defined(TRANSPOSE_4X4)
+            BufferParam param = { 1, 3 };
+            add_2D_tensor_argument(idx, _input, param, slice);
+            param.binding_point = 2;
+            add_2D_tensor_argument(idx, _output, param, slice);
+#elif defined(TRANSPOSE_8X8) /* TRANSPOSE_4X4 */
+            BufferParam param = { 1, 4 };
+            add_2D_tensor_argument(idx, _input, param, slice);
+            param.binding_point = 2;
+            add_2D_tensor_argument(idx, _output, param, slice);
+#endif /* TRANSPOSE_4X4 */
         }
 
         _kernel.update_shader_params();
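Note: both remaining kernels follow the same pattern of picking the widest vector access the tensor allows and falling back otherwise. For im2col the selection is driven by the input row width; a compact standalone restatement of that logic (the function name is illustrative, not part of the library):

    #include <cstdio>
    #include <initializer_list>

    // Mirrors the F16 vector-width selection in GCIm2ColKernel::configure():
    // prefer 8-wide loads, fall back to 4-wide, 2-wide, then a generic 2-wide path.
    static const char *select_im2col_variant(int input_width, int &elems_per_iter)
    {
        if(input_width % 8 == 0) { elems_per_iter = 8; return "IM2COL_REDUCED_8X"; }
        if(input_width % 4 == 0) { elems_per_iter = 4; return "IM2COL_REDUCED_4X"; }
        if(input_width % 2 == 0) { elems_per_iter = 2; return "IM2COL_REDUCED_2X"; }
        elems_per_iter = 2;       return "IM2COL_REDUCED_GENERIC";
    }

    int main()
    {
        for(const int w : { 224, 100, 54, 7 })
        {
            int n = 0;
            std::printf("width %3d -> %s (%d elems/iter)\n", w, select_im2col_variant(w, n), n);
        }
        return 0;
    }

The transpose kernel makes the analogous trade at compile time: the TRANSPOSE_8X8 variant moves 8x8 F16 tiles per iteration instead of the 4x4 tiles of the TRANSPOSE_4X4 variant.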