From c78d4bca25886d279edfe3914c179261a1bfe505 Mon Sep 17 00:00:00 2001 From: Gian Marco Date: Thu, 25 Jan 2018 13:49:44 +0000 Subject: COMPMID-765 - Added third dimension for CLTuner Change-Id: I0a7ea4cde1dbf8edd28908dfff80928ef7e996c4 Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/117647 Reviewed-by: Georgios Pinitas Tested-by: Jenkins --- src/runtime/CL/CLTuner.cpp | 51 ++++++++++++++++++++++++++++++---------------- 1 file changed, 33 insertions(+), 18 deletions(-) (limited to 'src/runtime/CL/CLTuner.cpp') diff --git a/src/runtime/CL/CLTuner.cpp b/src/runtime/CL/CLTuner.cpp index 7f5be86833..351f6751c3 100644 --- a/src/runtime/CL/CLTuner.cpp +++ b/src/runtime/CL/CLTuner.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017 ARM Limited. + * Copyright (c) 2017-2018 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -72,33 +72,48 @@ cl::NDRange CLTuner::find_optimal_lws(ICLKernel &kernel) double min_exec_time = std::numeric_limits::max(); - cl::NDRange opt_lws = cl::NDRange(1, 1); + cl::NDRange opt_lws = cl::NDRange(1, 1, 1); - for(int y = 1; y <= 16; ++y) + const int x_step = std::max(1, kernel.window().x().step()); + const int y_step = std::max(1, kernel.window().y().step()); + const int z_step = std::max(1, kernel.window().z().step()); + const int x_end = kernel.window().x().end() - kernel.window().x().start() / x_step > 1 ? 16 : 1; + const int y_end = kernel.window().y().end() - kernel.window().y().start() / y_step > 1 ? 16 : 1; + const int z_end = kernel.window().z().end() - kernel.window().z().start() / z_step > 1 ? 8 : 1; + + for(int z = 1; z <= z_end; ++z) { - for(int x = 1; x <= 16; ++x) + for(int y = 1; y <= y_end; ++y) { - cl::NDRange lws_test = cl::NDRange(x, y); + for(int x = 1; x <= x_end; ++x) + { + if(x == 1 && y == 1 && z == 1) + { + continue; + } - //Set the Local-Workgroup-Size - kernel.set_lws_hint(lws_test); + cl::NDRange lws_test = cl::NDRange(x, y, z); - auto t_start = std::chrono::high_resolution_clock::now(); + //Set the Local-Workgroup-Size + kernel.set_lws_hint(lws_test); - // Run - kernel.run(kernel.window(), q); + auto t_start = std::chrono::high_resolution_clock::now(); - CLScheduler::get().sync(); + // Run + kernel.run(kernel.window(), q); - auto t_stop = std::chrono::high_resolution_clock::now(); + CLScheduler::get().sync(); - std::chrono::duration fp_nano = t_stop - t_start; + auto t_stop = std::chrono::high_resolution_clock::now(); - // Check the execution time - if(fp_nano.count() < min_exec_time) - { - min_exec_time = fp_nano.count(); - opt_lws = cl::NDRange(x, y); + std::chrono::duration fp_nano = t_stop - t_start; + + // Check the execution time + if(fp_nano.count() < min_exec_time) + { + min_exec_time = fp_nano.count(); + opt_lws = cl::NDRange(x, y, z); + } } } } -- cgit v1.2.1