aboutsummaryrefslogtreecommitdiff
path: root/src/runtime/CL
diff options
context:
space:
mode:
authorVidhya Sudhan Loganathan <vidhyasudhan.loganathan@arm.com>2019-04-25 09:27:24 +0100
committerGian Marco Iodice <gianmarco.iodice@arm.com>2019-04-25 16:18:58 +0000
commit050471e40fc58cb5ea745701a43ec5b2b9586b81 (patch)
tree0ee684bcc93fae693686c391e42a2b824705aeb1 /src/runtime/CL
parentd038dafe3810d22c8664ceef4fe49aad77abdbd1 (diff)
downloadComputeLibrary-050471e40fc58cb5ea745701a43ec5b2b9586b81.tar.gz
COMPMID-1974 : Extend CLTuner to support different of level of tuning
Change-Id: I52e4a00a25e7f7a17050038cee7c30e508553722 Signed-off-by: Vidhya Sudhan Loganathan <vidhyasudhan.loganathan@arm.com> Reviewed-on: https://review.mlplatform.org/c/977 Comments-Addressed: Pablo Marquez <pablo.tello@arm.com> Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com> Tested-by: Arm Jenkins <bsgcomp@arm.com> Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com> Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Diffstat (limited to 'src/runtime/CL')
-rw-r--r--src/runtime/CL/CLTuner.cpp122
-rw-r--r--src/runtime/CL/tuners/CLLWSList.cpp112
2 files changed, 160 insertions, 74 deletions
diff --git a/src/runtime/CL/CLTuner.cpp b/src/runtime/CL/CLTuner.cpp
index a262d6b95c..8f8d3e7c3a 100644
--- a/src/runtime/CL/CLTuner.cpp
+++ b/src/runtime/CL/CLTuner.cpp
@@ -22,6 +22,7 @@
* SOFTWARE.
*/
#include "arm_compute/runtime/CL/CLTuner.h"
+#include "arm_compute/runtime/CL/tuners/CLLWSList.h"
#include "arm_compute/core/CL/ICLKernel.h"
#include "arm_compute/core/Error.h"
@@ -31,42 +32,13 @@
#include <fstream>
#include <iostream>
#include <limits>
+#include <memory>
#include <string>
namespace arm_compute
{
-namespace
-{
-/** Utility function used to initialize the LWS values to test.
- * Only the LWS values which are power of 2 or satisfy the modulo conditions with GWS are taken into account by the CLTuner
- *
- * @param[in, out] lws Vector of LWS to test for a specific dimension
- * @param[in] gws Size of the GWS
- * @param[in] lws_max Max LKWS value allowed to be tested
- * @param[in] mod_let_one True if the results of the modulo operation between gws and the lws can be less than one.
- */
-void initialize_lws_values(std::vector<unsigned int> &lws, unsigned int gws, unsigned int lws_max, bool mod_let_one)
-{
- lws.push_back(1);
-
- for(unsigned int i = 2; i <= lws_max; ++i)
- {
- // Power of two condition
- const bool is_power_of_two = (i & (i - 1)) == 0;
-
- // Condition for the module accordingly with the mod_let_one flag
- const bool mod_cond = mod_let_one ? (gws % i) <= 1 : (gws % i) == 0;
-
- if(mod_cond || is_power_of_two)
- {
- lws.push_back(i);
- }
- }
-}
-} // namespace
-
CLTuner::CLTuner(bool tune_new_kernels)
- : real_clEnqueueNDRangeKernel(nullptr), _lws_table(), _kernel_event(), _tune_new_kernels(tune_new_kernels)
+ : real_clEnqueueNDRangeKernel(nullptr), _lws_table(), _kernel_event(), _tune_new_kernels(tune_new_kernels), _tuner_mode(CLTunerMode::EXHAUSTIVE)
{
}
@@ -88,6 +60,15 @@ bool CLTuner::tune_new_kernels() const
return _tune_new_kernels;
}
+void CLTuner::set_tuner_mode(CLTunerMode mode)
+{
+ _tuner_mode = mode;
+}
+CLTunerMode CLTuner::get_tuner_mode() const
+{
+ return _tuner_mode;
+}
+
void CLTuner::tune_kernel_static(ICLKernel &kernel)
{
ARM_COMPUTE_UNUSED(kernel);
@@ -182,61 +163,54 @@ cl::NDRange CLTuner::find_optimal_lws(ICLKernel &kernel)
};
CLSymbols::get().clEnqueueNDRangeKernel_ptr = interceptor;
- cl_ulong min_exec_time = std::numeric_limits<cl_ulong>::max();
+ cl::NDRange gws = ICLKernel::gws_from_window(kernel.window());
- cl::NDRange gws = ICLKernel::gws_from_window(kernel.window());
- cl::NDRange opt_lws = cl::NullRange;
+ // Run the kernel with default lws to be used as baseline
+ kernel.run(kernel.window(), queue_profiler);
- const unsigned int lws_x_max = std::min(static_cast<unsigned int>(gws[0]), 64u);
- const unsigned int lws_y_max = std::min(static_cast<unsigned int>(gws[1]), 32u);
- const unsigned int lws_z_max = std::min(static_cast<unsigned int>(gws[2]), 32u);
+ queue_profiler.finish();
- std::vector<unsigned int> lws_x;
- std::vector<unsigned int> lws_y;
- std::vector<unsigned int> lws_z;
+ const cl_ulong start = _kernel_event.getProfilingInfo<CL_PROFILING_COMMAND_START>();
+ const cl_ulong end = _kernel_event.getProfilingInfo<CL_PROFILING_COMMAND_END>();
+ cl_ulong min_exec_time = end - start;
+ _kernel_event = nullptr;
- // Initialize the LWS values to test
- initialize_lws_values(lws_x, gws[0], lws_x_max, gws[2] > 16);
- initialize_lws_values(lws_y, gws[1], lws_y_max, gws[2] > 16);
- initialize_lws_values(lws_z, gws[2], lws_z_max, false);
+ cl::NDRange opt_lws = cl::NullRange;
- for(const auto &z : lws_z)
+ //Construct the list of LWS values to be tested based on the tuner mode.
+ auto lws_list = cl_tuner::CLLWSListFactory::get_lws_list(_tuner_mode, gws);
+ for(size_t i = 0; i < lws_list->size(); ++i)
{
- for(const auto &y : lws_y)
- {
- for(const auto &x : lws_x)
- {
- cl::NDRange lws_test = cl::NDRange(x, y, z);
-
- bool invalid_lws = (x * y * z > kernel.get_max_workgroup_size()) || (x == 1 && y == 1 && z == 1);
+ cl::NDRange lws_test = (*lws_list)[i];
+ auto x = lws_test[0];
+ auto y = lws_test[1];
+ auto z = lws_test[2];
+ bool invalid_lws = (x * y * z > kernel.get_max_workgroup_size()) || (x == 1 && y == 1 && z == 1);
- invalid_lws = invalid_lws || (x > gws[0]) || (y > gws[1]) || (z > gws[2]);
+ invalid_lws = invalid_lws || (x > gws[0]) || (y > gws[1]) || (z > gws[2]);
- if(invalid_lws)
- {
- continue;
- }
-
- //Set the Local-Workgroup-Size
- kernel.set_lws_hint(lws_test);
+ if(invalid_lws)
+ {
+ continue;
+ }
- // Run the kernel
- kernel.run(kernel.window(), queue_profiler);
+ //Set the Local-Workgroup-Size
+ kernel.set_lws_hint(lws_test);
- queue_profiler.finish();
+ // Run the kernel
+ kernel.run(kernel.window(), queue_profiler);
- const cl_ulong start = _kernel_event.getProfilingInfo<CL_PROFILING_COMMAND_START>();
- const cl_ulong end = _kernel_event.getProfilingInfo<CL_PROFILING_COMMAND_END>();
- const cl_ulong diff = end - start;
- _kernel_event = nullptr;
+ queue_profiler.finish();
- // Check the execution time
- if(diff < min_exec_time)
- {
- min_exec_time = diff;
- opt_lws = cl::NDRange(x, y, z);
- }
- }
+ const cl_ulong start = _kernel_event.getProfilingInfo<CL_PROFILING_COMMAND_START>();
+ const cl_ulong end = _kernel_event.getProfilingInfo<CL_PROFILING_COMMAND_END>();
+ const cl_ulong diff = end - start;
+ _kernel_event = nullptr;
+ // Check the execution time
+ if(diff < min_exec_time)
+ {
+ min_exec_time = diff;
+ opt_lws = cl::NDRange(x, y, z);
}
}
diff --git a/src/runtime/CL/tuners/CLLWSList.cpp b/src/runtime/CL/tuners/CLLWSList.cpp
new file mode 100644
index 0000000000..97134b1b2c
--- /dev/null
+++ b/src/runtime/CL/tuners/CLLWSList.cpp
@@ -0,0 +1,112 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/tuners/CLLWSList.h"
+
+namespace arm_compute
+{
+namespace cl_tuner
+{
+size_t CLLWSList::size()
+{
+ return search_space_shape.total_size();
+}
+
+cl::NDRange CLLWSListExhaustive::operator[](size_t index)
+{
+ ARM_COMPUTE_ERROR_ON(index >= size());
+ auto coords = index2coords(search_space_shape, index);
+ return cl::NDRange(coords[0] + 1, coords[1] + 1, coords[2] + 1);
+}
+
+CLLWSListExhaustive::CLLWSListExhaustive(const cl::NDRange &gws)
+{
+ search_space_shape = TensorShape(std::min(static_cast<unsigned int>(gws[0]), max_lws_supported_x), std::min(static_cast<unsigned int>(gws[1]), max_lws_supported_y),
+ std::min(static_cast<unsigned int>(gws[2]), max_lws_supported_z));
+}
+
+cl::NDRange CLLWSListNormal::operator[](size_t index)
+{
+ ARM_COMPUTE_ERROR_ON(index >= size());
+ auto coords = index2coords(search_space_shape, index);
+ return cl::NDRange(_lws_x[coords[0]], _lws_y[coords[1]], _lws_z[coords[2]]);
+}
+
+CLLWSListNormal::CLLWSListNormal(const cl::NDRange &gws)
+{
+ auto lws_x_max = std::min(static_cast<unsigned int>(gws[0]), max_lws_supported_x);
+ auto lws_y_max = std::min(static_cast<unsigned int>(gws[1]), max_lws_supported_y);
+ auto lws_z_max = std::min(static_cast<unsigned int>(gws[2]), max_lws_supported_z);
+
+ // Initialize the LWS values to test
+ initialize_lws_values(_lws_x, gws[0], lws_x_max, gws[2] > 16); // Explore lws that are not factors of gws only when gws[2] > 16
+ initialize_lws_values(_lws_y, gws[1], lws_y_max, gws[2] > 16); // Explore lws that are not factors of gws only when gws[2] > 16
+ initialize_lws_values(_lws_z, gws[2], lws_z_max, false);
+
+ search_space_shape = TensorShape(_lws_x.size(), _lws_y.size(), _lws_z.size());
+}
+
+void CLLWSListNormal::initialize_lws_values(std::vector<unsigned int> &lws, unsigned int gws, unsigned int lws_max, bool mod_let_one)
+{
+ lws.push_back(1);
+
+ for(unsigned int i = 2; i <= lws_max; ++i)
+ {
+ // Power of two condition
+ const bool is_power_of_two = (i & (i - 1)) == 0;
+
+ // Condition for the module accordingly with the mod_let_one flag
+ const bool mod_cond = mod_let_one ? (gws % i) <= 1 : (gws % i) == 0;
+
+ if(mod_cond || is_power_of_two)
+ {
+ lws.push_back(i);
+ }
+ }
+}
+
+CLLWSListRapid::CLLWSListRapid(const cl::NDRange &gws)
+{
+ auto lws_x_max = std::min(static_cast<unsigned int>(gws[0]), 8u); // Limit exploration to 1 - 8
+ auto lws_y_max = std::min(static_cast<unsigned int>(gws[1]), 4u); // Limit exploration to 1 - 4
+ auto lws_z_max = std::min(static_cast<unsigned int>(gws[2]), 4u); // Limit exploration to 1 - 4
+
+ // Initialize the LWS values to test
+ initialize_lws_values(_lws_x, lws_x_max);
+ initialize_lws_values(_lws_y, lws_y_max);
+ initialize_lws_values(_lws_z, lws_z_max);
+
+ search_space_shape = TensorShape(_lws_x.size(), _lws_y.size(), _lws_z.size());
+}
+
+void CLLWSListRapid::initialize_lws_values(std::vector<unsigned int> &lws, unsigned int lws_max)
+{
+ lws.push_back(1);
+
+ for(unsigned int i = 2; i <= lws_max; i *= 4)
+ {
+ lws.push_back(i);
+ }
+}
+} // namespace cl_tuner
+} // namespace arm_compute