about | summary | refs | log | tree | commit | diff
path: root/src
diff options
context:
space:
mode:
author Gian Marco <gianmarco.iodice@arm.com> 2017-09-08 16:13:11 +0100
committer Anthony Barbier <anthony.barbier@arm.com> 2018-11-02 16:35:24 +0000
commit de691f055ac255c798a766483eef63465ac90c75 (patch)
tree 0929f439b048ffa2b2fc8222388f9ee14f3a2a2e /src
parent 54f366afa63522b8c0ea3b0e5e8d3012a4412681 (diff)
download ComputeLibrary-de691f055ac255c798a766483eef63465ac90c75.tar.gz
COMPMID-524 - Implemented CLTuner object
Change-Id: Idbdbecca1fc299ed042936119d90e2bed8db0938
Reviewed-on: http://mpd-gerrit.cambridge.arm.com/87101
Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
Tested-by: Kaizen <jeremy.johnson+kaizengerrit@arm.com>
Diffstat (limited to 'src')
-rw-r--r-- src/core/CL/ICLKernel.cpp 2
-rw-r--r-- src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp 18
-rw-r--r-- src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp 8
-rw-r--r-- src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp 11
-rw-r--r-- src/core/CL/kernels/CLIm2ColKernel.cpp 12
-rw-r--r-- src/runtime/CL/CLScheduler.cpp 13
-rw-r--r-- src/runtime/CL/CLTuner.cpp 118
7 files changed, 176 insertions, 6 deletions
diff --git a/src/core/CL/ICLKernel.cpp b/src/core/CL/ICLKernel.cpp
index 7a95374bbf..b0ac40adf7 100644
--- a/src/core/CL/ICLKernel.cpp
+++ b/src/core/CL/ICLKernel.cpp
@@ -60,7 +60,7 @@ void arm_compute::enqueue(cl::CommandQueue &queue, ICLKernel &kernel, const Wind
}
ICLKernel::ICLKernel()
- : _kernel(nullptr), _lws_hint(CLKernelLibrary::get().default_ndrange()), _target(GPUTarget::MIDGARD)
+ : _kernel(nullptr), _lws_hint(CLKernelLibrary::get().default_ndrange()), _target(GPUTarget::MIDGARD), _config_id("")
{
}
diff --git a/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp b/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp
index 75e6d5e971..4224d9bb8e 100644
--- a/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp
+++ b/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp
@@ -230,6 +230,24 @@ void CLDirectConvolutionLayerKernel::configure(const ICLTensor *input, const ICL
ICLKernel::configure(win);
}
+
+ // Set config_id for enabling LWS tuning
+ _config_id = "direct_convolution_";
+ _config_id += lower_string(string_from_data_type(input->info()->data_type()));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(kernel_size);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(_conv_pad_x);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(_conv_pad_y);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(_conv_stride_x);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(_conv_stride_y);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(output->info()->dimension(0));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(output->info()->dimension(1));
}
void CLDirectConvolutionLayerKernel::run(const Window &window, cl::CommandQueue &queue)
diff --git a/src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp b/src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp
index 5b6e0ec6af..268260b8d5 100644
--- a/src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp
+++ b/src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp
@@ -81,6 +81,14 @@ void CLGEMMInterleave4x4Kernel::configure(const ICLTensor *input, ICLTensor *out
output_access.set_valid_region(win, input->info()->valid_region());
ICLKernel::configure(win);
+
+ // Set config_id for enabling LWS tuning
+ _config_id = "interleave4x4_";
+ _config_id += lower_string(string_from_data_type(input->info()->data_type()));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(output->info()->dimension(0));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(output->info()->dimension(1));
}
void CLGEMMInterleave4x4Kernel::run(const Window &window, cl::CommandQueue &queue)
diff --git a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
index 684e3232d5..b184c507ff 100644
--- a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
@@ -157,6 +157,17 @@ void CLGEMMMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTen
output_access.set_valid_region(win, ValidRegion(coord, output->info()->tensor_shape()));
ICLKernel::configure(win);
+
+ // Set config_id for enabling LWS tuning
+ _config_id = "gemm_";
+ _config_id += (is_interleaved_transposed ? "reshaped_" : "");
+ _config_id += lower_string(string_from_data_type(input0->info()->data_type()));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(output->info()->dimension(1));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(output->info()->dimension(0));
+ _config_id += "_";
+ _config_id += (is_interleaved_transposed ? support::cpp11::to_string(input1->info()->dimension(0)) : support::cpp11::to_string(input1->info()->dimension(1)));
}
}
diff --git a/src/core/CL/kernels/CLIm2ColKernel.cpp b/src/core/CL/kernels/CLIm2ColKernel.cpp
index 3d21a9e3c0..98a799f783 100644
--- a/src/core/CL/kernels/CLIm2ColKernel.cpp
+++ b/src/core/CL/kernels/CLIm2ColKernel.cpp
@@ -123,6 +123,15 @@ void CLIm2ColKernel::configure(const ICLTensor *input, ICLTensor *output, const
}
ICLKernel::configure(win);
+
+ // Set config_id for enabling LWS tuning
+ _config_id = "im2col_";
+ _config_id += (run_img2col_reduced ? "reduced_" : "");
+ _config_id += lower_string(string_from_data_type(input->info()->data_type()));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(output->info()->dimension(0));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(output->info()->dimension(1));
}
void CLIm2ColKernel::run(const Window &window, cl::CommandQueue &queue)
@@ -160,9 +169,6 @@ void CLIm2ColKernel::run_generic(const Window &window, cl::CommandQueue &queue)
slice_out.set(Window::DimY, Window::Dimension(0, _output->info()->dimension(1), 1));
slice_out.set(Window::DimZ, Window::Dimension(0, 1, 1));
- // Set the local-workgroup size
- _lws_hint = cl::NDRange(4, 4, 4);
-
do
{
unsigned int idx = 0;
diff --git a/src/runtime/CL/CLScheduler.cpp b/src/runtime/CL/CLScheduler.cpp
index f413f626eb..71a749fe52 100644
--- a/src/runtime/CL/CLScheduler.cpp
+++ b/src/runtime/CL/CLScheduler.cpp
@@ -24,11 +24,12 @@
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/core/CL/ICLKernel.h"
+#include "arm_compute/runtime/CL/CLTuner.h"
using namespace arm_compute;
CLScheduler::CLScheduler()
- : _context(), _queue(), _target(GPUTarget::MIDGARD), _is_initialised(false)
+ : _context(), _queue(), _target(GPUTarget::MIDGARD), _is_initialised(false), _cl_tuner()
{
}
@@ -44,10 +45,18 @@ void CLScheduler::enqueue(ICLKernel &kernel, bool flush)
"The CLScheduler is not initialised yet! Please call the CLScheduler::get().default_init(), \
or CLScheduler::get()::init() and CLKernelLibrary::get()::init() function before running functions!");
+ // Tune the kernel if the CLTuner has been provided
+ if(_cl_tuner != nullptr)
+ {
+ // Tune the OpenCL kernel
+ _cl_tuner->tune_kernel(kernel);
+ }
+
+ // Run kernel
kernel.run(kernel.window(), _queue);
if(flush)
{
_queue.flush();
}
-}
+} \ No newline at end of file
diff --git a/src/runtime/CL/CLTuner.cpp b/src/runtime/CL/CLTuner.cpp
new file mode 100644
index 0000000000..f3300d3f83
--- /dev/null
+++ b/src/runtime/CL/CLTuner.cpp
@@ -0,0 +1,118 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/CLTuner.h"
+
+#include "arm_compute/core/CL/ICLKernel.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include <chrono>
+#include <limits>
+#include <string>
+
+using namespace arm_compute;
+
+CLTuner::CLTuner() // Default constructor
+ : _lws_table() // Value-initialised; presumably an empty config_id -> LWS map (see import_lws_table's parameter type) - TODO confirm against CLTuner.h
+{
+}
+
+void CLTuner::tune_kernel(ICLKernel &kernel) // Sets the kernel's LWS hint from the table, searching for (and storing) an LWS first when its config_id has no entry yet
+{
+ // Get the configuration ID from the kernel
+ const std::string &config_id = kernel.config_id();
+
+ // Check whether an optimal LWS has to be looked up. If config_id is empty, the kernel is not tuned and is left untouched
+ if(config_id != "")
+ {
+ auto p = _lws_table.find(config_id); // Look for an LWS already stored for this configuration
+
+ if(p == _lws_table.end())
+ {
+ // No entry yet: find the optimal LWS for the kernel (find_optimal_lws runs the kernel once per candidate)
+ cl::NDRange opt_lws = find_optimal_lws(kernel);
+
+ // Insert the optimal LWS in the table so that later calls with the same config_id take the lookup branch below
+ _lws_table.emplace(config_id, opt_lws);
+
+ // Set the Local-Workgroup-Size that was just found
+ kernel.set_lws_hint(opt_lws);
+ }
+ else
+ {
+ // Entry found: set the stored Local-Workgroup-Size without running the search again
+ kernel.set_lws_hint(p->second);
+ }
+ }
+}
+
+cl::NDRange CLTuner::find_optimal_lws(ICLKernel &kernel) // Runs the kernel with every 2D LWS (x, y), x and y in [1, 16], and returns the one with the smallest measured time
+{
+ cl::CommandQueue q = CLScheduler::get().queue(); // Queue obtained from the scheduler singleton; every trial run is enqueued on it
+
+ double min_exec_time = std::numeric_limits<double>::max(); // Best time measured so far, in nanoseconds
+
+ cl::NDRange opt_lws = cl::NDRange(1, 1); // Returned as-is only if no candidate measures below the initial min_exec_time
+
+ for(int y = 1; y <= 16; ++y)
+ {
+ for(int x = 1; x <= 16; ++x) // 16 x 16 = 256 kernel executions in total
+ {
+ cl::NDRange lws_test = cl::NDRange(x, y);
+
+ // Set the Local-Workgroup-Size under test
+ kernel.set_lws_hint(lws_test);
+
+ auto t_start = std::chrono::high_resolution_clock::now();
+
+ // Run the kernel on its own window
+ kernel.run(kernel.window(), q);
+
+ CLScheduler::get().sync(); // Presumably blocks until the queued work has completed, so the host clock spans the execution - TODO confirm
+
+ auto t_stop = std::chrono::high_resolution_clock::now();
+
+ std::chrono::duration<double, std::nano> fp_nano = t_stop - t_start; // Elapsed host-side time in nanoseconds
+
+ // Keep this candidate only if it is strictly faster than the best so far (ties keep the earlier candidate)
+ if(fp_nano.count() < min_exec_time)
+ {
+ min_exec_time = fp_nano.count();
+ opt_lws = cl::NDRange(x, y);
+ }
+ }
+ }
+
+ return opt_lws; // NOTE(review): the hint last set on the kernel here is the final candidate (16, 16), not opt_lws; tune_kernel sets opt_lws afterwards
+}
+
+void CLTuner::import_lws_table(const std::unordered_map<std::string, cl::NDRange> &lws_table) // Replaces the whole internal table with a copy of lws_table
+{
+ _lws_table.clear(); // Existing entries are dropped, not merged (the copy-assignment below would replace them anyway)
+ _lws_table = lws_table;
+}
+
+const std::unordered_map<std::string, cl::NDRange> &CLTuner::export_lws_table() // Returns a const reference to the internal table; no copy is made
+{
+ return _lws_table;
+} \ No newline at end of file