From 07c37f9954555ae3523c85f16e46cf94e9a9e290 Mon Sep 17 00:00:00 2001 From: Joel Liang Date: Fri, 17 Nov 2017 11:34:19 +0800 Subject: APPBROWSER-313: Performance improvement for softmax layer Process 8 elements at one time for better performance Change-Id: I90d31e5d0834c5096fdb82f174482ade762b63d2 Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/111840 Reviewed-by: Stephen Li Tested-by: BSG Visual Compute Jenkins server to access repositories on http://mpd-gerrit.cambridge.arm.com Reviewed-by: Anthony Barbier --- .../GLES_COMPUTE/kernels/GCSoftmaxLayerKernel.cpp | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'src/core/GLES_COMPUTE/kernels/GCSoftmaxLayerKernel.cpp') diff --git a/src/core/GLES_COMPUTE/kernels/GCSoftmaxLayerKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCSoftmaxLayerKernel.cpp index 29a1385f87..040a66358f 100644 --- a/src/core/GLES_COMPUTE/kernels/GCSoftmaxLayerKernel.cpp +++ b/src/core/GLES_COMPUTE/kernels/GCSoftmaxLayerKernel.cpp @@ -66,10 +66,10 @@ void GCLogits1DMaxKernel::configure(const IGCTensor *input, IGCTensor *output) build_opts.emplace("#define LOCAL_SIZE_Z " + support::cpp11::to_string(1)); build_opts.insert("#define SOFTMAX_LAYER_MAX"); - // Tell the kernel that the width is not a multiple of 4 - if((input->info()->dimension(0) % 4) != 0) + // Tell the kernel that the width is not a multiple of 8 + if((input->info()->dimension(0) % 8) != 0) { - build_opts.insert("#define NON_MULTIPLE_OF_4"); + build_opts.insert("#define NON_MULTIPLE_OF_8"); } // Create kernel @@ -80,8 +80,8 @@ void GCLogits1DMaxKernel::configure(const IGCTensor *input, IGCTensor *output) _kernel.set_argument(idx++, input->info()->dimension(0)); // Configure kernel window - // The kernel loops over all elements in steps of 4 - const unsigned int num_elems_processed_per_iteration = ceil_to_multiple(input->info()->dimension(0), 4); + // The kernel loops over all elements in steps of 8 + const unsigned int 
num_elems_processed_per_iteration = ceil_to_multiple(input->info()->dimension(0), 8); unsigned int num_elems_written_per_iteration = 1; if(input->info()->data_type() == DataType::F16) { @@ -131,10 +131,10 @@ void GCLogits1DShiftExpSumKernel::configure(const IGCTensor *input, const IGCTen build_opts.emplace("#define LOCAL_SIZE_Z " + support::cpp11::to_string(1)); build_opts.insert("#define SOFTMAX_LAYER_SHIFT_EXP_SUM"); - // Tell the kernel that the width is not a multiple of 4 - if((input->info()->dimension(0) % 4) != 0) + // Tell the kernel that the width is not a multiple of 8 + if((input->info()->dimension(0) % 8) != 0) { - build_opts.insert("#define NON_MULTIPLE_OF_4"); + build_opts.insert("#define NON_MULTIPLE_OF_8"); } // Create kernel @@ -145,8 +145,8 @@ void GCLogits1DShiftExpSumKernel::configure(const IGCTensor *input, const IGCTen _kernel.set_argument(idx++, input->info()->dimension(0)); // Configure window - // The kernel loops over all elements in steps of 4 - const unsigned int num_elems_processed_per_iteration = ceil_to_multiple(input->info()->dimension(0), 4); + // The kernel loops over all elements in steps of 8 + const unsigned int num_elems_processed_per_iteration = ceil_to_multiple(input->info()->dimension(0), 8); unsigned int num_elems_written_per_iteration = 1; if(input->info()->data_type() == DataType::F16) { @@ -227,7 +227,7 @@ void GCLogits1DNormKernel::configure(const IGCTensor *input, const IGCTensor *su _kernel = static_cast<GCKernel>(GCKernelLibrary::get().create_kernel("softmax_layer_norm", build_opts)); // Configure window - constexpr unsigned int num_elems_processed_per_iteration = 4; + constexpr unsigned int num_elems_processed_per_iteration = 8; unsigned int num_elems_written_per_iteration = 1; if(input->info()->data_type() == DataType::F16) { -- cgit v1.2.1