APPBROWSER-313: Performance improvement for softmax layer

Process 8 elements at one time for better performance

Change-Id: I90d31e5d0834c5096fdb82f174482ade762b63d2
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/111840
Reviewed-by: Stephen Li <stephen.li@arm.com>
Tested-by: BSG Visual Compute Jenkins server to access repositories on http://mpd-gerrit.cambridge.arm.com <bsgcomp@arm.com>
Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
author: Joel Liang <joel.liang@arm.com> 2017-11-17 11:34:19 +0800
committer: Anthony Barbier <anthony.barbier@arm.com> 2018-11-02 16:42:17 +0000
commit: 07c37f9954555ae3523c85f16e46cf94e9a9e290 (patch)
tree: 30b4fa60148b7a9a44e26b9488f3dcf482e11faa /src/core/GLES_COMPUTE/kernels/GCSoftmaxLayerKernel.cpp
parent: d9f8071de655fd23529588d312dd5624b2d43315 (diff)
download: ComputeLibrary-07c37f9954555ae3523c85f16e46cf94e9a9e290.tar.gz
1 file changed, 11 insertions, 11 deletions
diff --git a/src/core/GLES_COMPUTE/kernels/GCSoftmaxLayerKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCSoftmaxLayerKernel.cpp
index 29a1385f87..040a66358f 100644
--- a/src/core/GLES_COMPUTE/kernels/GCSoftmaxLayerKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCSoftmaxLayerKernel.cpp
@@ -66,10 +66,10 @@ void GCLogits1DMaxKernel::configure(const IGCTensor *input, IGCTensor *output)
     build_opts.emplace("#define LOCAL_SIZE_Z " + support::cpp11::to_string(1));
     build_opts.insert("#define SOFTMAX_LAYER_MAX");
 
-    // Tell the kernel that the width is not a multiple of 4
-    if((input->info()->dimension(0) % 4) != 0)
+    // Tell the kernel that the width is not a multiple of 8
+    if((input->info()->dimension(0) % 8) != 0)
     {
-        build_opts.insert("#define NON_MULTIPLE_OF_4");
+        build_opts.insert("#define NON_MULTIPLE_OF_8");
     }
 
     // Create kernel
@@ -80,8 +80,8 @@ void GCLogits1DMaxKernel::configure(const IGCTensor *input, IGCTensor *output)
     _kernel.set_argument(idx++, input->info()->dimension(0));
 
     // Configure kernel window
-    // The kernel loops over all elements in steps of 4
-    const unsigned int num_elems_processed_per_iteration = ceil_to_multiple(input->info()->dimension(0), 4);
+    // The kernel loops over all elements in steps of 8
+    const unsigned int num_elems_processed_per_iteration = ceil_to_multiple(input->info()->dimension(0), 8);
     unsigned int       num_elems_written_per_iteration   = 1;
     if(input->info()->data_type() == DataType::F16)
     {
@@ -131,10 +131,10 @@ void GCLogits1DShiftExpSumKernel::configure(const IGCTensor *input, const IGCTen
     build_opts.emplace("#define LOCAL_SIZE_Z " + support::cpp11::to_string(1));
     build_opts.insert("#define SOFTMAX_LAYER_SHIFT_EXP_SUM");
 
-    // Tell the kernel that the width is not a multiple of 4
-    if((input->info()->dimension(0) % 4) != 0)
+    // Tell the kernel that the width is not a multiple of 8
+    if((input->info()->dimension(0) % 8) != 0)
     {
-        build_opts.insert("#define NON_MULTIPLE_OF_4");
+        build_opts.insert("#define NON_MULTIPLE_OF_8");
     }
 
     // Create kernel
@@ -145,8 +145,8 @@ void GCLogits1DShiftExpSumKernel::configure(const IGCTensor *input, const IGCTen
     _kernel.set_argument(idx++, input->info()->dimension(0));
 
     // Configure window
-    // The kernel loops over all elements in steps of 4
-    const unsigned int num_elems_processed_per_iteration = ceil_to_multiple(input->info()->dimension(0), 4);
+    // The kernel loops over all elements in steps of 8
+    const unsigned int num_elems_processed_per_iteration = ceil_to_multiple(input->info()->dimension(0), 8);
     unsigned int       num_elems_written_per_iteration   = 1;
     if(input->info()->data_type() == DataType::F16)
     {
@@ -227,7 +227,7 @@ void GCLogits1DNormKernel::configure(const IGCTensor *input, const IGCTensor *su
     _kernel = static_cast<GCKernel>(GCKernelLibrary::get().create_kernel("softmax_layer_norm", build_opts));
 
     // Configure window
-    constexpr unsigned int num_elems_processed_per_iteration = 4;
+    constexpr unsigned int num_elems_processed_per_iteration = 8;
     unsigned int           num_elems_written_per_iteration   = 1;
     if(input->info()->data_type() == DataType::F16)
     {
author	Joel Liang <joel.liang@arm.com>	2017-11-17 11:34:19 +0800
committer	Anthony Barbier <anthony.barbier@arm.com>	2018-11-02 16:42:17 +0000
commit	07c37f9954555ae3523c85f16e46cf94e9a9e290 (patch)
tree	30b4fa60148b7a9a44e26b9488f3dcf482e11faa /src/core/GLES_COMPUTE/kernels/GCSoftmaxLayerKernel.cpp
parent	d9f8071de655fd23529588d312dd5624b2d43315 (diff)
download	ComputeLibrary-07c37f9954555ae3523c85f16e46cf94e9a9e290.tar.gz