From a74923ce6f4077ab2aef3651818c45f73fef97fd Mon Sep 17 00:00:00 2001 From: Gian Marco Iodice Date: Thu, 31 Jan 2019 17:06:54 +0000 Subject: COMPMID-1910: Improve CLTuner reducing the number of LWS to test Change-Id: I842120a2bcddc5bf8677ee4d0b1f9d379771b36b Signed-off-by: Gian Marco Iodice Reviewed-on: https://review.mlplatform.org/622 Reviewed-by: Georgios Pinitas Tested-by: Arm Jenkins --- examples/graph_shufflenet.cpp | 4 +-- src/runtime/CL/CLTuner.cpp | 79 +++++++++++++++++++++++++++---------------- 2 files changed, 52 insertions(+), 31 deletions(-) diff --git a/examples/graph_shufflenet.cpp b/examples/graph_shufflenet.cpp index 39c2227022..e6016f0451 100644 --- a/examples/graph_shufflenet.cpp +++ b/examples/graph_shufflenet.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018 ARM Limited. + * Copyright (c) 2018-2019 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -57,7 +57,7 @@ public: // Set default layout if needed (Single kernel grouped convolution not yet supported int NHWC) if(!common_opts.data_layout->is_set()) { - common_params.data_layout = DataLayout::NCHW; + common_params.data_layout = DataLayout::NHWC; } // Checks diff --git a/src/runtime/CL/CLTuner.cpp b/src/runtime/CL/CLTuner.cpp index 5f82cd3fbe..c09914cea2 100644 --- a/src/runtime/CL/CLTuner.cpp +++ b/src/runtime/CL/CLTuner.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2018 ARM Limited. + * Copyright (c) 2017-2019 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -33,7 +33,37 @@ #include #include -using namespace arm_compute; +namespace arm_compute +{ +namespace +{ +/** Utility function used to initialize the LWS values to test. 
+ * Only the LWS values which are power of 2 or satisfy the modulo conditions with GWS are taken into account by the CLTuner + * + * @param[in, out] lws Vector of LWS to test for a specific dimension + * @param[in] gws Size of the GWS + * @param[in] lws_max Max LWS value allowed to be tested + * @param[in] mod_let_one True if the result of the modulo operation between gws and the lws can be less than or equal to one. + */ +void initialize_lws_values(std::vector &lws, unsigned int gws, unsigned int lws_max, bool mod_let_one) +{ + lws.push_back(1); + + for(unsigned int i = 2; i <= lws_max; ++i) + { + // Power of two condition + const bool is_power_of_two = (i & (i - 1)) == 0; + + // Condition for the modulo according to the mod_let_one flag + const bool mod_cond = mod_let_one ? (gws % i) <= 1 : (gws % i) == 0; + + if(mod_cond || is_power_of_two) + { + lws.push_back(i); + } + } +} +} // namespace CLTuner::CLTuner(bool tune_new_kernels) : real_clEnqueueNDRangeKernel(nullptr), _lws_table(), _queue(), _queue_profiler(), _kernel_event(), _tune_new_kernels(tune_new_kernels) @@ -145,43 +175,33 @@ cl::NDRange CLTuner::find_optimal_lws(ICLKernel &kernel) cl_ulong min_exec_time = std::numeric_limits::max(); + cl::NDRange gws = ICLKernel::gws_from_window(kernel.window()); cl::NDRange opt_lws = cl::NullRange; - const int x_step = std::max(1, kernel.window().x().step()); - const int y_step = std::max(1, kernel.window().y().step()); - const int z_step = std::max(1, kernel.window().z().step()); - const int x_end = kernel.window().x().end() - kernel.window().x().start() / x_step > 1 ? 16 : 1; - const int y_end = kernel.window().y().end() - kernel.window().y().start() / y_step > 1 ? 16 : 1; - const int z_end = kernel.window().z().end() - kernel.window().z().start() / z_step > 1 ? 
8 : 1; - - // First run using the default LWS - { - cl::NDRange lws_test = cl::NullRange; - - kernel.set_lws_hint(lws_test); - - // Run the kernel - kernel.run(kernel.window(), _queue_profiler); + const unsigned int lws_x_max = std::min(static_cast(gws[0]), 64u); + const unsigned int lws_y_max = std::min(static_cast(gws[1]), 32u); + const unsigned int lws_z_max = std::min(static_cast(gws[2]), 32u); - _queue_profiler.finish(); + std::vector lws_x; + std::vector lws_y; + std::vector lws_z; - const cl_ulong start = _kernel_event.getProfilingInfo(); - const cl_ulong end = _kernel_event.getProfilingInfo(); - const cl_ulong diff = end - start; - _kernel_event = nullptr; + // Initialize the LWS values to test + initialize_lws_values(lws_x, gws[0], lws_x_max, gws[2] > 16); + initialize_lws_values(lws_y, gws[1], lws_y_max, gws[2] > 16); + initialize_lws_values(lws_z, gws[2], lws_z_max, false); - min_exec_time = diff; - } - - for(int z = 1; z <= z_end; ++z) + for(const auto &z : lws_z) { - for(int y = 1; y <= y_end; ++y) + for(const auto &y : lws_y) { - for(int x = 1; x <= x_end; ++x) + for(const auto &x : lws_x) { cl::NDRange lws_test = cl::NDRange(x, y, z); - const bool invalid_lws = (x * y * z > static_cast(kernel.get_max_workgroup_size())) || (x == 1 && y == 1 && z == 1); + bool invalid_lws = (x * y * z > kernel.get_max_workgroup_size()) || (x == 1 && y == 1 && z == 1); + + invalid_lws = invalid_lws || (x > gws[0]) || (y > gws[1]) || (z > gws[2]); if(invalid_lws) { @@ -278,3 +298,4 @@ void CLTuner::save_to_file(const std::string &filename) const } fs.close(); } +} // namespace arm_compute \ No newline at end of file -- cgit v1.2.1