aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorGian Marco <gianmarco.iodice@arm.com>2017-09-08 16:13:11 +0100
committerAnthony Barbier <anthony.barbier@arm.com>2018-11-02 16:35:24 +0000
commitde691f055ac255c798a766483eef63465ac90c75 (patch)
tree0929f439b048ffa2b2fc8222388f9ee14f3a2a2e
parent54f366afa63522b8c0ea3b0e5e8d3012a4412681 (diff)
downloadComputeLibrary-de691f055ac255c798a766483eef63465ac90c75.tar.gz
COMPMID-524 - Implemented CLTuner object
Change-Id: Idbdbecca1fc299ed042936119d90e2bed8db0938 Reviewed-on: http://mpd-gerrit.cambridge.arm.com/87101 Reviewed-by: Anthony Barbier <anthony.barbier@arm.com> Tested-by: Kaizen <jeremy.johnson+kaizengerrit@arm.com>
-rw-r--r--arm_compute/core/CL/ICLKernel.h24
-rw-r--r--arm_compute/runtime/CL/CLScheduler.h33
-rw-r--r--arm_compute/runtime/CL/CLTuner.h73
-rw-r--r--arm_compute/runtime/CL/ICLTuner.h44
-rw-r--r--src/core/CL/ICLKernel.cpp2
-rw-r--r--src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp18
-rw-r--r--src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp8
-rw-r--r--src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp11
-rw-r--r--src/core/CL/kernels/CLIm2ColKernel.cpp12
-rw-r--r--src/runtime/CL/CLScheduler.cpp13
-rw-r--r--src/runtime/CL/CLTuner.cpp118
11 files changed, 341 insertions, 15 deletions
diff --git a/arm_compute/core/CL/ICLKernel.h b/arm_compute/core/CL/ICLKernel.h
index 1334c54a6c..d118d13f3f 100644
--- a/arm_compute/core/CL/ICLKernel.h
+++ b/arm_compute/core/CL/ICLKernel.h
@@ -29,6 +29,8 @@
#include "arm_compute/core/CL/OpenCL.h"
#include "arm_compute/core/IKernel.h"
+#include <string>
+
namespace arm_compute
{
template <typename T>
@@ -140,6 +142,21 @@ public:
_lws_hint = lws_hint;
}
+ /** Get the configuration ID
+ *
+ * @note The configuration ID can be used by the caller to distinguish different configurations of the same OpenCL kernel.
+ * In particular, CLTuner uses it as the key under which the best LWS found for a configuration is stored.
+ * The configuration ID should be provided only for the kernels potentially affected by the LWS geometry; it is an empty string for all others.
+ *
+ * @note This method should be called after the configuration of the kernel, because the ID is set by the configure() of the kernels that support tuning.
+ *
+ * @return Reference to the configuration ID string; it is empty if the kernel did not set one
+ */
+ const std::string &config_id() const
+ {
+ return _config_id;
+ }
+
/** Set the targeted GPU architecture
*
* @param[in] target The targeted GPU architecture
@@ -191,9 +208,10 @@ private:
unsigned int num_arguments_per_tensor() const;
protected:
- cl::Kernel _kernel; /**< OpenCL kernel to run */
- cl::NDRange _lws_hint; /**< Local workgroup size hint for the OpenCL kernel */
- GPUTarget _target; /**< The targeted GPU */
+ cl::Kernel _kernel; /**< OpenCL kernel to run */
+ cl::NDRange _lws_hint; /**< Local workgroup size hint for the OpenCL kernel */
+ GPUTarget _target; /**< The targeted GPU */
+ std::string _config_id; /**< Configuration ID */
};
/** Add the kernel to the command queue with the given window.
diff --git a/arm_compute/runtime/CL/CLScheduler.h b/arm_compute/runtime/CL/CLScheduler.h
index 3f3a8de753..11affebc48 100644
--- a/arm_compute/runtime/CL/CLScheduler.h
+++ b/arm_compute/runtime/CL/CLScheduler.h
@@ -30,6 +30,7 @@
#include "arm_compute/core/CL/OpenCL.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLTuner.h"
namespace arm_compute
{
@@ -41,6 +42,10 @@ class CLScheduler
private:
/** Constructor */
CLScheduler();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLScheduler(const CLScheduler &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLScheduler &operator=(const CLScheduler &) = delete;
public:
/** Access the scheduler singleton.
@@ -50,11 +55,13 @@ public:
static CLScheduler &get();
/** Initialises the context and command queue used by the scheduler to default values
* and sets a default device and kernel path for the @ref CLKernelLibrary.
+ *
+ * @param[in] cl_tuner (Optional) Pointer to ICLTuner (default=nullptr)
*/
- void default_init()
+ void default_init(ICLTuner *cl_tuner = nullptr)
{
CLKernelLibrary::get().init("./cl_kernels/", cl::Context::getDefault(), cl::Device::getDefault());
- init(cl::Context::getDefault(), cl::CommandQueue::getDefault(), cl::Device::getDefault());
+ init(cl::Context::getDefault(), cl::CommandQueue::getDefault(), cl::Device::getDefault(), cl_tuner);
}
/** Schedule the execution of the passed kernel if possible.
*
@@ -65,17 +72,20 @@ public:
/** Initialises the context and command queue to be used by the scheduler.
*
- * @param[in] context A CL context.
- * @param[in] queue A CL command queue.
- * @param[in] device A CL device.
+ * @param[in] context A CL context.
+ * @param[in] queue A CL command queue.
+ * @param[in] device A CL device.
+ * @param[in] cl_tuner (Optional) Pointer to OpenCL tuner (default=nullptr)
+ * Note: It is the caller's responsibility to release the memory allocated for the tuner
*/
void init(cl::Context context = cl::Context::getDefault(), cl::CommandQueue queue = cl::CommandQueue::getDefault(),
- cl::Device device = cl::Device::getDefault())
+ cl::Device device = cl::Device::getDefault(), ICLTuner *cl_tuner = nullptr)
{
_context = std::move(context);
_queue = std::move(queue);
_target = get_target_from_device(device);
_is_initialised = true;
+ _cl_tuner = cl_tuner;
}
/** Accessor for the associated CL context.
@@ -153,10 +163,21 @@ public:
}
private:
+ /** Tune OpenCL kernel
+ *
+ * @note This method uses a brute force approach to find the optimal LWS
+ *
+ * @param[in] kernel Kernel to tune
+ *
+ * @return The optimal LWS for the specified kernel
+ */
+ cl::NDRange tune_kernel(ICLKernel &kernel);
+
cl::Context _context;
cl::CommandQueue _queue;
GPUTarget _target;
bool _is_initialised;
+ ICLTuner *_cl_tuner;
};
}
#endif /* __ARM_COMPUTE_CLSCHEDULER_H__ */
diff --git a/arm_compute/runtime/CL/CLTuner.h b/arm_compute/runtime/CL/CLTuner.h
new file mode 100644
index 0000000000..8a7b96aa09
--- /dev/null
+++ b/arm_compute/runtime/CL/CLTuner.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLTUNER_H__
+#define __ARM_COMPUTE_CLTUNER_H__
+
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/runtime/CL/ICLTuner.h"
+
+#include <unordered_map>
+
+namespace arm_compute
+{
+class ICLKernel;
+
+/** Basic implementation of the OpenCL tuner interface: it caches the best LWS found for each kernel configuration ID */
+class CLTuner : public ICLTuner
+{
+public:
+ /** Constructor: the LWS table starts empty */
+ CLTuner();
+
+ /** Destructor */
+ ~CLTuner() = default;
+
+ /** Import LWS table: the table currently held is replaced by a copy of the one passed in
+ *
+ * @param[in] lws_table The unordered_map container to import (kernel configuration ID -> LWS)
+ */
+ void import_lws_table(const std::unordered_map<std::string, cl::NDRange> &lws_table);
+
+ /** Export LWS table
+ *
+ * @return Reference to the internal LWS table (kernel configuration ID -> LWS); no copy is made
+ */
+ const std::unordered_map<std::string, cl::NDRange> &export_lws_table();
+
+ // Inherited methods overridden:
+ void tune_kernel(ICLKernel &kernel) override;
+
+private:
+ /** Find optimal LWS using brute-force approach: the kernel is executed and timed once per candidate LWS
+ *
+ * @param[in] kernel OpenCL kernel to be tuned with LWS
+ *
+ * @return The LWS with the shortest measured execution time
+ */
+ cl::NDRange find_optimal_lws(ICLKernel &kernel);
+
+ std::unordered_map<std::string, cl::NDRange> _lws_table; /**< Best LWS known so far, keyed by kernel configuration ID */
+};
+}
+#endif /*__ARM_COMPUTE_CLTUNER_H__ */
diff --git a/arm_compute/runtime/CL/ICLTuner.h b/arm_compute/runtime/CL/ICLTuner.h
new file mode 100644
index 0000000000..c71835c0aa
--- /dev/null
+++ b/arm_compute/runtime/CL/ICLTuner.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_ICLTUNER_H__
+#define __ARM_COMPUTE_ICLTUNER_H__
+
+namespace arm_compute
+{
+class ICLKernel;
+
+/** Basic interface for tuning the OpenCL kernels; CLScheduler calls tune_kernel() on the tuner it was given before it runs a kernel */
+class ICLTuner
+{
+public:
+ /** Virtual destructor, so that an implementation can be destroyed through an ICLTuner pointer */
+ virtual ~ICLTuner() = default;
+ /** Tune OpenCL kernel
+ *
+ * @param[in,out] kernel Kernel to tune; the implementation may modify it (e.g. CLTuner sets its LWS hint)
+ */
+ virtual void tune_kernel(ICLKernel &kernel) = 0;
+};
+}
+#endif /*__ARM_COMPUTE_ICLTUNER_H__ */
diff --git a/src/core/CL/ICLKernel.cpp b/src/core/CL/ICLKernel.cpp
index 7a95374bbf..b0ac40adf7 100644
--- a/src/core/CL/ICLKernel.cpp
+++ b/src/core/CL/ICLKernel.cpp
@@ -60,7 +60,7 @@ void arm_compute::enqueue(cl::CommandQueue &queue, ICLKernel &kernel, const Wind
}
ICLKernel::ICLKernel()
- : _kernel(nullptr), _lws_hint(CLKernelLibrary::get().default_ndrange()), _target(GPUTarget::MIDGARD)
+ : _kernel(nullptr), _lws_hint(CLKernelLibrary::get().default_ndrange()), _target(GPUTarget::MIDGARD), _config_id("")
{
}
diff --git a/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp b/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp
index 75e6d5e971..4224d9bb8e 100644
--- a/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp
+++ b/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp
@@ -230,6 +230,24 @@ void CLDirectConvolutionLayerKernel::configure(const ICLTensor *input, const ICL
ICLKernel::configure(win);
}
+
+ // Set config_id for enabling LWS tuning
+ _config_id = "direct_convolution_";
+ _config_id += lower_string(string_from_data_type(input->info()->data_type()));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(kernel_size);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(_conv_pad_x);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(_conv_pad_y);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(_conv_stride_x);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(_conv_stride_y);
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(output->info()->dimension(0));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(output->info()->dimension(1));
}
void CLDirectConvolutionLayerKernel::run(const Window &window, cl::CommandQueue &queue)
diff --git a/src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp b/src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp
index 5b6e0ec6af..268260b8d5 100644
--- a/src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp
+++ b/src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp
@@ -81,6 +81,14 @@ void CLGEMMInterleave4x4Kernel::configure(const ICLTensor *input, ICLTensor *out
output_access.set_valid_region(win, input->info()->valid_region());
ICLKernel::configure(win);
+
+ // Set config_id for enabling LWS tuning
+ _config_id = "interleave4x4_";
+ _config_id += lower_string(string_from_data_type(input->info()->data_type()));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(output->info()->dimension(0));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(output->info()->dimension(1));
}
void CLGEMMInterleave4x4Kernel::run(const Window &window, cl::CommandQueue &queue)
diff --git a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
index 684e3232d5..b184c507ff 100644
--- a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
@@ -157,6 +157,17 @@ void CLGEMMMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTen
output_access.set_valid_region(win, ValidRegion(coord, output->info()->tensor_shape()));
ICLKernel::configure(win);
+
+ // Set config_id for enabling LWS tuning
+ _config_id = "gemm_";
+ _config_id += (is_interleaved_transposed ? "reshaped_" : "");
+ _config_id += lower_string(string_from_data_type(input0->info()->data_type()));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(output->info()->dimension(1));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(output->info()->dimension(0));
+ _config_id += "_";
+ _config_id += (is_interleaved_transposed ? support::cpp11::to_string(input1->info()->dimension(0)) : support::cpp11::to_string(input1->info()->dimension(1)));
}
}
diff --git a/src/core/CL/kernels/CLIm2ColKernel.cpp b/src/core/CL/kernels/CLIm2ColKernel.cpp
index 3d21a9e3c0..98a799f783 100644
--- a/src/core/CL/kernels/CLIm2ColKernel.cpp
+++ b/src/core/CL/kernels/CLIm2ColKernel.cpp
@@ -123,6 +123,15 @@ void CLIm2ColKernel::configure(const ICLTensor *input, ICLTensor *output, const
}
ICLKernel::configure(win);
+
+ // Set config_id for enabling LWS tuning
+ _config_id = "im2col_";
+ _config_id += (run_img2col_reduced ? "reduced_" : "");
+ _config_id += lower_string(string_from_data_type(input->info()->data_type()));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(output->info()->dimension(0));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(output->info()->dimension(1));
}
void CLIm2ColKernel::run(const Window &window, cl::CommandQueue &queue)
@@ -160,9 +169,6 @@ void CLIm2ColKernel::run_generic(const Window &window, cl::CommandQueue &queue)
slice_out.set(Window::DimY, Window::Dimension(0, _output->info()->dimension(1), 1));
slice_out.set(Window::DimZ, Window::Dimension(0, 1, 1));
- // Set the local-workgroup size
- _lws_hint = cl::NDRange(4, 4, 4);
-
do
{
unsigned int idx = 0;
diff --git a/src/runtime/CL/CLScheduler.cpp b/src/runtime/CL/CLScheduler.cpp
index f413f626eb..71a749fe52 100644
--- a/src/runtime/CL/CLScheduler.cpp
+++ b/src/runtime/CL/CLScheduler.cpp
@@ -24,11 +24,12 @@
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/core/CL/ICLKernel.h"
+#include "arm_compute/runtime/CL/CLTuner.h"
using namespace arm_compute;
CLScheduler::CLScheduler()
- : _context(), _queue(), _target(GPUTarget::MIDGARD), _is_initialised(false)
+ : _context(), _queue(), _target(GPUTarget::MIDGARD), _is_initialised(false), _cl_tuner()
{
}
@@ -44,10 +45,18 @@ void CLScheduler::enqueue(ICLKernel &kernel, bool flush)
"The CLScheduler is not initialised yet! Please call the CLScheduler::get().default_init(), \
or CLScheduler::get()::init() and CLKernelLibrary::get()::init() function before running functions!");
+ // Tune the kernel if the CLTuner has been provided
+ if(_cl_tuner != nullptr)
+ {
+ // Tune the OpenCL kernel
+ _cl_tuner->tune_kernel(kernel);
+ }
+
+ // Run kernel
kernel.run(kernel.window(), _queue);
if(flush)
{
_queue.flush();
}
-}
+} \ No newline at end of file
diff --git a/src/runtime/CL/CLTuner.cpp b/src/runtime/CL/CLTuner.cpp
new file mode 100644
index 0000000000..f3300d3f83
--- /dev/null
+++ b/src/runtime/CL/CLTuner.cpp
@@ -0,0 +1,118 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/CLTuner.h"
+
+#include "arm_compute/core/CL/ICLKernel.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include <chrono>
+#include <limits>
+#include <string>
+
+using namespace arm_compute;
+
+CLTuner::CLTuner()
+ : _lws_table() // The LWS table starts empty; it is filled by tune_kernel() or replaced by import_lws_table()
+{
+}
+
+// Sets the LWS hint of the kernel to the best value known for its configuration ID, searching for it on first use
+void CLTuner::tune_kernel(ICLKernel &kernel)
+{
+    // The configuration ID is the key under which the LWS of a kernel is stored in the table
+    const std::string &config_id = kernel.config_id();
+
+    // A kernel without configuration ID does not require tuning: its LWS hint is left untouched
+    if(config_id.empty())
+    {
+        return;
+    }
+
+    auto entry = _lws_table.find(config_id);
+
+    if(entry != _lws_table.end())
+    {
+        // Already tuned or imported: apply the stored Local-Workgroup-Size
+        kernel.set_lws_hint(entry->second);
+        return;
+    }
+
+    // Unknown configuration: find the optimal LWS by running the kernel (expensive, done once per ID)
+    cl::NDRange opt_lws = find_optimal_lws(kernel);
+
+    // Store the optimal LWS in the table and set it as Local-Workgroup-Size of the kernel
+    _lws_table.emplace(config_id, opt_lws);
+    kernel.set_lws_hint(opt_lws);
+}
+
+// Brute-force LWS search: the kernel is run once with every 2D LWS from 1x1 to 16x16 and the fastest one is returned
+cl::NDRange CLTuner::find_optimal_lws(ICLKernel &kernel)
+{
+    constexpr int max_lws_xy = 16; // Largest extent tried along X and along Y
+    cl::CommandQueue q = CLScheduler::get().queue();
+    // Wait for previously enqueued work (sync() is relied upon to block until the queue is idle), else it is charged to the first candidate
+    CLScheduler::get().sync();
+
+    double      min_exec_time = std::numeric_limits<double>::max();
+    cl::NDRange opt_lws       = cl::NDRange(1, 1);
+
+    for(int y = 1; y <= max_lws_xy; ++y)
+    {
+        for(int x = 1; x <= max_lws_xy; ++x)
+        {
+            // Set the Local-Workgroup-Size
+            cl::NDRange lws_test = cl::NDRange(x, y);
+            kernel.set_lws_hint(lws_test);
+
+            // steady_clock is monotonic, whereas high_resolution_clock may alias the adjustable system clock
+            const auto t_start = std::chrono::steady_clock::now();
+
+            // Run the kernel and wait for it, so that the measured interval covers its execution
+            kernel.run(kernel.window(), q);
+            CLScheduler::get().sync();
+
+            const std::chrono::duration<double, std::nano> fp_nano = std::chrono::steady_clock::now() - t_start;
+
+            // Keep the fastest candidate seen so far (the earliest one wins on ties)
+            if(fp_nano.count() < min_exec_time)
+            {
+                min_exec_time = fp_nano.count();
+                opt_lws       = lws_test;
+            }
+        }
+    }
+
+    return opt_lws;
+}
+
+void CLTuner::import_lws_table(const std::unordered_map<std::string, cl::NDRange> &lws_table)
+{
+    // Replace the stored table with a copy of lws_table. No clear() beforehand: it would wipe the input when it aliases _lws_table (e.g. the result of export_lws_table())
+    _lws_table = lws_table;
+}
+
+const std::unordered_map<std::string, cl::NDRange> &CLTuner::export_lws_table()
+{
+ return _lws_table; // Hands out the internal table by reference (no copy), so the reference must not outlive this tuner
+} \ No newline at end of file