about summary refs log tree commit diff
diff options
context:
space:
mode:
author steli01 <stephen.li@arm.com> 2018-01-02 14:56:06 +0800
committer Anthony Barbier <anthony.barbier@arm.com> 2018-11-02 16:42:33 +0000
commit d064389293e4a71781984b2b24f3d44964812949 (patch)
tree 7a74f8351d0b255ad9e25bd5b364654b9f0dc74d
parent 1d08a310b7316f2b731e60ac36dc68989d15b546 (diff)
download ComputeLibrary-d064389293e4a71781984b2b24f3d44964812949.tar.gz
APPBROWSER-357: Fix Transpose performance issue by tuning lws
Change-Id: Ia71435f6e5c5610e2b76d6d4eb61a8847ca42305
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/114829
Reviewed-by: Pablo Tello <pablo.tello@arm.com>
Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Tested-by: Anthony Barbier <anthony.barbier@arm.com>
-rw-r--r-- arm_compute/core/GLES_COMPUTE/IGCKernel.h 14
-rw-r--r-- src/core/GLES_COMPUTE/IGCKernel.cpp 2
-rw-r--r-- src/core/GLES_COMPUTE/kernels/GCTransposeKernel.cpp 38
-rw-r--r-- tests/benchmark/GLES_COMPUTE/Transpose.cpp 62
4 files changed, 104 insertions, 12 deletions
diff --git a/arm_compute/core/GLES_COMPUTE/IGCKernel.h b/arm_compute/core/GLES_COMPUTE/IGCKernel.h
index 11b2b17e51..ee1e166424 100644
--- a/arm_compute/core/GLES_COMPUTE/IGCKernel.h
+++ b/arm_compute/core/GLES_COMPUTE/IGCKernel.h
@@ -140,6 +140,17 @@ public:
*/
virtual void run(const Window &window) = 0;
+ /** Set the Local-Workgroup-Size hint
+ *
+ * @note This method should be called after the configuration of the kernel
+ *
+ * @param[in] lws_hint Local-Workgroup-Size to use
+ */
+ void set_lws_hint(gles::NDRange &lws_hint)
+ {
+ _lws_hint = lws_hint;
+ }
+
private:
/** Add the passed tensor's parameters to the object's kernel's arguments starting from the index idx.
*
@@ -159,7 +170,8 @@ private:
unsigned int num_arguments_per_tensor() const;
protected:
- GCKernel _kernel; /**< GLES kernel to run */
+ GCKernel _kernel; /**< GLES kernel to run */
+ gles::NDRange _lws_hint; /**< Local workgroup size hint for the GLES kernel */
};
/** Add the kernel to the command queue with the given window.
diff --git a/src/core/GLES_COMPUTE/IGCKernel.cpp b/src/core/GLES_COMPUTE/IGCKernel.cpp
index c60c1674bf..df9c798e42 100644
--- a/src/core/GLES_COMPUTE/IGCKernel.cpp
+++ b/src/core/GLES_COMPUTE/IGCKernel.cpp
@@ -62,7 +62,7 @@ void arm_compute::enqueue(IGCKernel &kernel, const Window &window, const gles::N
}
IGCKernel::IGCKernel()
- : _kernel()
+ : _kernel(), _lws_hint(gles::NDRange(1U, 1U, 1U))
{
}
diff --git a/src/core/GLES_COMPUTE/kernels/GCTransposeKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCTransposeKernel.cpp
index 621c9693fe..5edc23b95e 100644
--- a/src/core/GLES_COMPUTE/kernels/GCTransposeKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCTransposeKernel.cpp
@@ -23,7 +23,7 @@
*/
#include "arm_compute/core/GLES_COMPUTE/kernels/GCTransposeKernel.h"
-#include "arm_compute/core/AccessWindowTranspose.h"
+#include "arm_compute/core/AccessWindowStatic.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/GLES_COMPUTE/GCHelpers.h"
#include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h"
@@ -57,12 +57,22 @@ void GCTransposeKernel::configure(const IGCTensor *input, IGCTensor *output)
_input = input;
_output = output;
+ // for better performance
+ if(w_out < 512 && h_out < 512)
+ {
+ _lws_hint = gles::NDRange(8U, 1U, 1U);
+ }
+ else
+ {
+ _lws_hint = gles::NDRange(1U, 8U, 1U);
+ }
+
std::set<std::string> build_opts;
std::string dt_name = (input->info()->data_type() == DataType::F32) ? "DATA_TYPE_FP32" : "DATA_TYPE_FP16";
build_opts.emplace(("#define " + dt_name));
- build_opts.emplace("#define LOCAL_SIZE_X " + support::cpp11::to_string(1));
- build_opts.emplace("#define LOCAL_SIZE_Y " + support::cpp11::to_string(1));
- build_opts.emplace("#define LOCAL_SIZE_Z " + support::cpp11::to_string(1));
+ build_opts.emplace("#define LOCAL_SIZE_X " + support::cpp11::to_string(_lws_hint[0]));
+ build_opts.emplace("#define LOCAL_SIZE_Y " + support::cpp11::to_string(_lws_hint[1]));
+ build_opts.emplace("#define LOCAL_SIZE_Z " + support::cpp11::to_string(_lws_hint[2]));
// Configure kernel window
unsigned int num_elems_processed_per_iteration = 4;
@@ -91,13 +101,21 @@ void GCTransposeKernel::configure(const IGCTensor *input, IGCTensor *output)
// Create kernel
_kernel = static_cast<GCKernel>(GCKernelLibrary::get().create_kernel("transpose", build_opts));
- Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration, num_elems_processed_per_iteration));
+ const unsigned int width_aligned = num_elems_processed_per_iteration * static_cast<unsigned int>(_lws_hint[0]);
+ const unsigned int height_aligned = num_elems_processed_per_iteration * static_cast<unsigned int>(_lws_hint[1]);
- AccessWindowRectangle input_access(input->info(), 0, 0, num_elems_processed_per_iteration, num_elems_processed_per_iteration);
- AccessWindowTranspose output_access(output->info(), 0, 0, num_elems_processed_per_iteration, num_elems_processed_per_iteration);
- update_window_and_padding(win, input_access, output_access);
+ AccessWindowStatic input_access(input->info(), 0, 0,
+ ceil_to_multiple(input->info()->dimension(0), width_aligned),
+ ceil_to_multiple(input->info()->dimension(1), height_aligned));
+ AccessWindowStatic output_access(output->info(), 0, 0,
+ ceil_to_multiple(output->info()->dimension(0), height_aligned),
+ ceil_to_multiple(output->info()->dimension(1), width_aligned));
- output_access.set_valid_region(win, input->info()->valid_region());
+ Window win = calculate_max_window(*input->info(), Steps(width_aligned, height_aligned));
+ win.set_dimension_step(Window::DimX, num_elems_processed_per_iteration);
+ win.set_dimension_step(Window::DimY, num_elems_processed_per_iteration);
+ update_window_and_padding(win, input_access, output_access);
+ output_access.set_valid_region(win, output->info()->valid_region());
IGCKernel::configure(win);
}
@@ -135,7 +153,7 @@ void GCTransposeKernel::run(const Window &window)
}
_kernel.update_shader_params();
- enqueue(*this, slice);
+ enqueue(*this, slice, _lws_hint);
}
while(window.slide_window_slice_2D(slice));
}
diff --git a/tests/benchmark/GLES_COMPUTE/Transpose.cpp b/tests/benchmark/GLES_COMPUTE/Transpose.cpp
new file mode 100644
index 0000000000..a76ca8fc2d
--- /dev/null
+++ b/tests/benchmark/GLES_COMPUTE/Transpose.cpp
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/GLES_COMPUTE/GCTensor.h"
+#include "arm_compute/runtime/GLES_COMPUTE/GCTensorAllocator.h"
+#include "arm_compute/runtime/GLES_COMPUTE/functions/GCTranspose.h"
+#include "tests/GLES_COMPUTE/GCAccessor.h"
+#include "tests/benchmark/fixtures/TransposeFixture.h"
+#include "tests/datasets/ShapeDatasets.h"
+#include "tests/framework/Macros.h"
+#include "tests/framework/datasets/Datasets.h"
+#include "utils/TypePrinter.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace benchmark
+{
+namespace
+{
+const auto data_types = framework::dataset::make("DataType", { DataType::F16, DataType::F32 });
+} // namespace
+
+using GCTransposeFixture = TransposeFixture<GCTensor, GCTranspose, GCAccessor>;
+
+TEST_SUITE(GC)
+TEST_SUITE(Transpose)
+
+REGISTER_FIXTURE_DATA_TEST_CASE(RunSmall, GCTransposeFixture, framework::DatasetMode::PRECOMMIT,
+ framework::dataset::combine(datasets::Small2DShapes(), data_types));
+
+REGISTER_FIXTURE_DATA_TEST_CASE(RunLarge, GCTransposeFixture, framework::DatasetMode::NIGHTLY,
+ framework::dataset::combine(datasets::Large2DShapes(), data_types));
+
+TEST_SUITE_END()
+TEST_SUITE_END()
+} // namespace benchmark
+} // namespace test
+} // namespace arm_compute