From d064389293e4a71781984b2b24f3d44964812949 Mon Sep 17 00:00:00 2001 From: steli01 Date: Tue, 2 Jan 2018 14:56:06 +0800 Subject: APPBROWSER-357: Fix Transpose performance issue by tuning lws Change-Id: Ia71435f6e5c5610e2b76d6d4eb61a8847ca42305 Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/114829 Reviewed-by: Pablo Tello Reviewed-by: Gian Marco Iodice Tested-by: Anthony Barbier --- arm_compute/core/GLES_COMPUTE/IGCKernel.h | 14 ++++- src/core/GLES_COMPUTE/IGCKernel.cpp | 2 +- .../GLES_COMPUTE/kernels/GCTransposeKernel.cpp | 38 +++++++++---- tests/benchmark/GLES_COMPUTE/Transpose.cpp | 62 ++++++++++++++++++++++ 4 files changed, 104 insertions(+), 12 deletions(-) create mode 100644 tests/benchmark/GLES_COMPUTE/Transpose.cpp diff --git a/arm_compute/core/GLES_COMPUTE/IGCKernel.h b/arm_compute/core/GLES_COMPUTE/IGCKernel.h index 11b2b17e51..ee1e166424 100644 --- a/arm_compute/core/GLES_COMPUTE/IGCKernel.h +++ b/arm_compute/core/GLES_COMPUTE/IGCKernel.h @@ -140,6 +140,17 @@ public: */ virtual void run(const Window &window) = 0; + /** Set the Local-Workgroup-Size hint + * + * @note This method should be called after the configuration of the kernel + * + * @param[in] lws_hint Local-Workgroup-Size to use + */ + void set_lws_hint(gles::NDRange &lws_hint) + { + _lws_hint = lws_hint; + } + private: /** Add the passed tensor's parameters to the object's kernel's arguments starting from the index idx. * @@ -159,7 +170,8 @@ private: unsigned int num_arguments_per_tensor() const; protected: - GCKernel _kernel; /**< GLES kernel to run */ + GCKernel _kernel; /**< GLES kernel to run */ + gles::NDRange _lws_hint; /**< Local workgroup size hint for the GLES kernel */ }; /** Add the kernel to the command queue with the given window. 
diff --git a/src/core/GLES_COMPUTE/IGCKernel.cpp b/src/core/GLES_COMPUTE/IGCKernel.cpp index c60c1674bf..df9c798e42 100644 --- a/src/core/GLES_COMPUTE/IGCKernel.cpp +++ b/src/core/GLES_COMPUTE/IGCKernel.cpp @@ -62,7 +62,7 @@ void arm_compute::enqueue(IGCKernel &kernel, const Window &window, const gles::N } IGCKernel::IGCKernel() - : _kernel() + : _kernel(), _lws_hint(gles::NDRange(1U, 1U, 1U)) { } diff --git a/src/core/GLES_COMPUTE/kernels/GCTransposeKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCTransposeKernel.cpp index 621c9693fe..5edc23b95e 100644 --- a/src/core/GLES_COMPUTE/kernels/GCTransposeKernel.cpp +++ b/src/core/GLES_COMPUTE/kernels/GCTransposeKernel.cpp @@ -23,7 +23,7 @@ */ #include "arm_compute/core/GLES_COMPUTE/kernels/GCTransposeKernel.h" -#include "arm_compute/core/AccessWindowTranspose.h" +#include "arm_compute/core/AccessWindowStatic.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/GLES_COMPUTE/GCHelpers.h" #include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h" @@ -57,12 +57,22 @@ void GCTransposeKernel::configure(const IGCTensor *input, IGCTensor *output) _input = input; _output = output; + // for better performance + if(w_out < 512 && h_out < 512) + { + _lws_hint = gles::NDRange(8U, 1U, 1U); + } + else + { + _lws_hint = gles::NDRange(1U, 8U, 1U); + } + std::set<std::string> build_opts; std::string dt_name = (input->info()->data_type() == DataType::F32) ? 
"DATA_TYPE_FP32" : "DATA_TYPE_FP16"; build_opts.emplace(("#define " + dt_name)); - build_opts.emplace("#define LOCAL_SIZE_X " + support::cpp11::to_string(1)); - build_opts.emplace("#define LOCAL_SIZE_Y " + support::cpp11::to_string(1)); - build_opts.emplace("#define LOCAL_SIZE_Z " + support::cpp11::to_string(1)); + build_opts.emplace("#define LOCAL_SIZE_X " + support::cpp11::to_string(_lws_hint[0])); + build_opts.emplace("#define LOCAL_SIZE_Y " + support::cpp11::to_string(_lws_hint[1])); + build_opts.emplace("#define LOCAL_SIZE_Z " + support::cpp11::to_string(_lws_hint[2])); // Configure kernel window unsigned int num_elems_processed_per_iteration = 4; @@ -91,13 +101,21 @@ void GCTransposeKernel::configure(const IGCTensor *input, IGCTensor *output) // Create kernel _kernel = static_cast<GCKernel>(GCKernelLibrary::get().create_kernel("transpose", build_opts)); - Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration, num_elems_processed_per_iteration)); + const unsigned int width_aligned = num_elems_processed_per_iteration * static_cast<unsigned int>(_lws_hint[0]); + const unsigned int height_aligned = num_elems_processed_per_iteration * static_cast<unsigned int>(_lws_hint[1]); - AccessWindowRectangle input_access(input->info(), 0, 0, num_elems_processed_per_iteration, num_elems_processed_per_iteration); - AccessWindowTranspose output_access(output->info(), 0, 0, num_elems_processed_per_iteration, num_elems_processed_per_iteration); - update_window_and_padding(win, input_access, output_access); + AccessWindowStatic input_access(input->info(), 0, 0, + ceil_to_multiple(input->info()->dimension(0), width_aligned), + ceil_to_multiple(input->info()->dimension(1), height_aligned)); + AccessWindowStatic output_access(output->info(), 0, 0, + ceil_to_multiple(output->info()->dimension(0), height_aligned), + ceil_to_multiple(output->info()->dimension(1), width_aligned)); - output_access.set_valid_region(win, input->info()->valid_region()); + Window win = 
calculate_max_window(*input->info(), Steps(width_aligned, height_aligned)); + win.set_dimension_step(Window::DimX, num_elems_processed_per_iteration); + win.set_dimension_step(Window::DimY, num_elems_processed_per_iteration); + update_window_and_padding(win, input_access, output_access); + output_access.set_valid_region(win, output->info()->valid_region()); IGCKernel::configure(win); } @@ -135,7 +153,7 @@ void GCTransposeKernel::run(const Window &window) } _kernel.update_shader_params(); - enqueue(*this, slice); + enqueue(*this, slice, _lws_hint); } while(window.slide_window_slice_2D(slice)); } diff --git a/tests/benchmark/GLES_COMPUTE/Transpose.cpp b/tests/benchmark/GLES_COMPUTE/Transpose.cpp new file mode 100644 index 0000000000..a76ca8fc2d --- /dev/null +++ b/tests/benchmark/GLES_COMPUTE/Transpose.cpp @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/GLES_COMPUTE/GCTensor.h" +#include "arm_compute/runtime/GLES_COMPUTE/GCTensorAllocator.h" +#include "arm_compute/runtime/GLES_COMPUTE/functions/GCTranspose.h" +#include "tests/GLES_COMPUTE/GCAccessor.h" +#include "tests/benchmark/fixtures/TransposeFixture.h" +#include "tests/datasets/ShapeDatasets.h" +#include "tests/framework/Macros.h" +#include "tests/framework/datasets/Datasets.h" +#include "utils/TypePrinter.h" + +namespace arm_compute +{ +namespace test +{ +namespace benchmark +{ +namespace +{ +const auto data_types = framework::dataset::make("DataType", { DataType::F16, DataType::F32 }); +} // namespace + +using GCTransposeFixture = TransposeFixture<GCTensor, GCTranspose, GCAccessor>; + +TEST_SUITE(GC) +TEST_SUITE(Transpose) + +REGISTER_FIXTURE_DATA_TEST_CASE(RunSmall, GCTransposeFixture, framework::DatasetMode::PRECOMMIT, + framework::dataset::combine(datasets::Small2DShapes(), data_types)); + +REGISTER_FIXTURE_DATA_TEST_CASE(RunLarge, GCTransposeFixture, framework::DatasetMode::NIGHTLY, + framework::dataset::combine(datasets::Large2DShapes(), data_types)); + +TEST_SUITE_END() +TEST_SUITE_END() +} // namespace benchmark +} // namespace test +} // namespace arm_compute -- cgit v1.2.1