APPBROWSER-312 Fully connected performance optimization

Change-Id: Ie93fd630ebbad7b6ca8812cb5044b3f1908b45fd
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/111830
Reviewed-by: Stephen Li <stephen.li@arm.com>
Tested-by: BSG Visual Compute Jenkins server to access repositories on http://mpd-gerrit.cambridge.arm.com <bsgcomp@arm.com>
Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
author: Frank Lei <frank.lei@arm.com> 2017-12-05 10:43:33 +0800
committer: Anthony Barbier <anthony.barbier@arm.com> 2018-11-02 16:42:17 +0000
commit: b9d38ee6378f3035f8dbad442223d3d9e2f3dc4f (patch)
tree: 89a4b81430100a4a91902d5987ae42edc438012c /src/core/GLES_COMPUTE/kernels/GCGEMMMatrixMultiplyKernel.cpp
parent: 397d58aa40b02a26923c34d8cd4ba274eac45963 (diff)
download: ComputeLibrary-b9d38ee6378f3035f8dbad442223d3d9e2f3dc4f.tar.gz
1 file changed, 29 insertions, 1 deletion
diff --git a/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixMultiplyKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixMultiplyKernel.cpp
index a75ab6b609..8179525470 100644
--- a/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixMultiplyKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixMultiplyKernel.cpp
@@ -118,9 +118,23 @@ void GCGEMMMatrixMultiplyKernel::configure(const IGCTensor *input0, const IGCTen
         switch(input0->info()->data_type())
         {
             case DataType::F16:
+                build_opts.emplace("#define DATA_TYPE_FP16");
+
+#define MM_PROCESS_4X_OPTIMIZED
+
+#if defined(MM_PROCESS_4X)
+                num_elems_processed_per_iteration_x = 4;
+                num_elems_processed_per_iteration_y = std::min(static_cast<int>(output->info()->dimension(1)), 4);
+                build_opts.emplace("#define MM_PROCESS_4X");
+#elif defined(MM_PROCESS_4X_OPTIMIZED) /* MM_PROCESS_4X */
                 num_elems_processed_per_iteration_x = 4;
+                num_elems_processed_per_iteration_y = std::min(static_cast<int>(output->info()->dimension(1)), 4);
+                build_opts.emplace("#define MM_PROCESS_4X_OPTIMIZED");
+#elif defined(MM_PROCESS_8X)           /* MM_PROCESS_4X */
+                num_elems_processed_per_iteration_x = 8;
                 num_elems_processed_per_iteration_y = 1;
-                build_opts.emplace("#define DATA_TYPE_FP16");
+                build_opts.emplace("#define MM_PROCESS_8X");
+#endif                                 /* MM_PROCESS_4X */
                 break;
 
             case DataType::F32:
@@ -143,8 +157,12 @@ void GCGEMMMatrixMultiplyKernel::configure(const IGCTensor *input0, const IGCTen
 
         win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
 
+#if defined(MM_PROCESS_4X_OPTIMIZED)
+        AccessWindowStatic input0_access(input0->info(), 0, 0, ceil_to_multiple(input0->info()->dimension(0), 8), ceil_to_multiple(input0->info()->dimension(1), num_elems_processed_per_iteration_y));
+#else  /* MM_PROCESS_4X_OPTIMIZED */
         AccessWindowStatic input0_access(input0->info(), 0, 0, ceil_to_multiple(input0->info()->dimension(0), num_elems_processed_per_iteration_x), ceil_to_multiple(input0->info()->dimension(1),
                                          num_elems_processed_per_iteration_y));
+#endif /* MM_PROCESS_4X_OPTIMIZED */
         AccessWindowStatic    input1_access(input1->info(), 0, 0, ceil_to_multiple(input1->info()->dimension(0), num_elems_processed_per_iteration_x), input1->info()->dimension(1));
         AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
 
@@ -185,9 +203,19 @@ void GCGEMMMatrixMultiplyKernel::run(const Window &window)
         switch(_input0->info()->data_type())
         {
             case DataType::F16:
+#if defined(MM_PROCESS_4X)
                 add_2D_tensor_argument(idx, _input0, BufferParam(1, 2), slice);
                 add_2D_tensor_argument(idx, _input1, BufferParam(2, 3), slice_b);
                 add_2D_tensor_argument(idx, _output, BufferParam(3, 3), slice);
+#elif defined(MM_PROCESS_4X_OPTIMIZED) /* MM_PROCESS_4X */
+                add_2D_tensor_argument(idx, _input0, BufferParam(1, 4), slice);
+                add_2D_tensor_argument(idx, _input1, BufferParam(2, 3), slice_b);
+                add_2D_tensor_argument(idx, _output, BufferParam(3, 3), slice);
+#elif defined(MM_PROCESS_8X)           /* MM_PROCESS_4X */
+                add_2D_tensor_argument(idx, _input0, BufferParam(1, 4), slice);
+                add_2D_tensor_argument(idx, _input1, BufferParam(2, 4), slice_b);
+                add_2D_tensor_argument(idx, _output, BufferParam(3, 4), slice);
+#endif                                 /* MM_PROCESS_4X */
                 break;
 
             case DataType::F32:
author	Frank Lei <frank.lei@arm.com>	2017-12-05 10:43:33 +0800
committer	Anthony Barbier <anthony.barbier@arm.com>	2018-11-02 16:42:17 +0000
commit	b9d38ee6378f3035f8dbad442223d3d9e2f3dc4f (patch)
tree	89a4b81430100a4a91902d5987ae42edc438012c /src/core/GLES_COMPUTE/kernels/GCGEMMMatrixMultiplyKernel.cpp
parent	397d58aa40b02a26923c34d8cd4ba274eac45963 (diff)
download	ComputeLibrary-b9d38ee6378f3035f8dbad442223d3d9e2f3dc4f.tar.gz