about | summary | refs | log | tree | commit | diff
path: root/src/core/GLES_COMPUTE/kernels/GCTransposeKernel.cpp
diff options
context:
space:
mode:
author: steli01 <stephen.li@arm.com> 2018-01-02 14:56:06 +0800
committer: Anthony Barbier <anthony.barbier@arm.com> 2018-11-02 16:42:33 +0000
commit: d064389293e4a71781984b2b24f3d44964812949 (patch)
tree: 7a74f8351d0b255ad9e25bd5b364654b9f0dc74d /src/core/GLES_COMPUTE/kernels/GCTransposeKernel.cpp
parent: 1d08a310b7316f2b731e60ac36dc68989d15b546 (diff)
download: ComputeLibrary-d064389293e4a71781984b2b24f3d44964812949.tar.gz
APPBROWSER-357: Fix Transpose performance issue by tuning lws
Change-Id: Ia71435f6e5c5610e2b76d6d4eb61a8847ca42305
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/114829
Reviewed-by: Pablo Tello <pablo.tello@arm.com>
Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Tested-by: Anthony Barbier <anthony.barbier@arm.com>
Diffstat (limited to 'src/core/GLES_COMPUTE/kernels/GCTransposeKernel.cpp')
-rw-r--r-- src/core/GLES_COMPUTE/kernels/GCTransposeKernel.cpp | 38
1 files changed, 28 insertions, 10 deletions
diff --git a/src/core/GLES_COMPUTE/kernels/GCTransposeKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCTransposeKernel.cpp
index 621c9693fe..5edc23b95e 100644
--- a/src/core/GLES_COMPUTE/kernels/GCTransposeKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCTransposeKernel.cpp
@@ -23,7 +23,7 @@
*/
#include "arm_compute/core/GLES_COMPUTE/kernels/GCTransposeKernel.h"
-#include "arm_compute/core/AccessWindowTranspose.h"
+#include "arm_compute/core/AccessWindowStatic.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/GLES_COMPUTE/GCHelpers.h"
#include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h"
@@ -57,12 +57,22 @@ void GCTransposeKernel::configure(const IGCTensor *input, IGCTensor *output)
_input = input;
_output = output;
+ // for better performance
+ if(w_out < 512 && h_out < 512)
+ {
+ _lws_hint = gles::NDRange(8U, 1U, 1U);
+ }
+ else
+ {
+ _lws_hint = gles::NDRange(1U, 8U, 1U);
+ }
+
std::set<std::string> build_opts;
std::string dt_name = (input->info()->data_type() == DataType::F32) ? "DATA_TYPE_FP32" : "DATA_TYPE_FP16";
build_opts.emplace(("#define " + dt_name));
- build_opts.emplace("#define LOCAL_SIZE_X " + support::cpp11::to_string(1));
- build_opts.emplace("#define LOCAL_SIZE_Y " + support::cpp11::to_string(1));
- build_opts.emplace("#define LOCAL_SIZE_Z " + support::cpp11::to_string(1));
+ build_opts.emplace("#define LOCAL_SIZE_X " + support::cpp11::to_string(_lws_hint[0]));
+ build_opts.emplace("#define LOCAL_SIZE_Y " + support::cpp11::to_string(_lws_hint[1]));
+ build_opts.emplace("#define LOCAL_SIZE_Z " + support::cpp11::to_string(_lws_hint[2]));
// Configure kernel window
unsigned int num_elems_processed_per_iteration = 4;
@@ -91,13 +101,21 @@ void GCTransposeKernel::configure(const IGCTensor *input, IGCTensor *output)
// Create kernel
_kernel = static_cast<GCKernel>(GCKernelLibrary::get().create_kernel("transpose", build_opts));
- Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration, num_elems_processed_per_iteration));
+ const unsigned int width_aligned = num_elems_processed_per_iteration * static_cast<unsigned int>(_lws_hint[0]);
+ const unsigned int height_aligned = num_elems_processed_per_iteration * static_cast<unsigned int>(_lws_hint[1]);
- AccessWindowRectangle input_access(input->info(), 0, 0, num_elems_processed_per_iteration, num_elems_processed_per_iteration);
- AccessWindowTranspose output_access(output->info(), 0, 0, num_elems_processed_per_iteration, num_elems_processed_per_iteration);
- update_window_and_padding(win, input_access, output_access);
+ AccessWindowStatic input_access(input->info(), 0, 0,
+ ceil_to_multiple(input->info()->dimension(0), width_aligned),
+ ceil_to_multiple(input->info()->dimension(1), height_aligned));
+ AccessWindowStatic output_access(output->info(), 0, 0,
+ ceil_to_multiple(output->info()->dimension(0), height_aligned),
+ ceil_to_multiple(output->info()->dimension(1), width_aligned));
- output_access.set_valid_region(win, input->info()->valid_region());
+ Window win = calculate_max_window(*input->info(), Steps(width_aligned, height_aligned));
+ win.set_dimension_step(Window::DimX, num_elems_processed_per_iteration);
+ win.set_dimension_step(Window::DimY, num_elems_processed_per_iteration);
+ update_window_and_padding(win, input_access, output_access);
+ output_access.set_valid_region(win, output->info()->valid_region());
IGCKernel::configure(win);
}
@@ -135,7 +153,7 @@ void GCTransposeKernel::run(const Window &window)
}
_kernel.update_shader_params();
- enqueue(*this, slice);
+ enqueue(*this, slice, _lws_hint);
}
while(window.slide_window_slice_2D(slice));
}