From 5111264954e2d1a4d3e91d23a0869a0d7105be4c Mon Sep 17 00:00:00 2001
From: morgolock
Date: Thu, 20 Aug 2020 14:51:39 +0100
Subject: COMPMID-3661: Added multidimension support to OMP scheduler.

Change-Id: Iedacf7094896f08d7c2847c8fb99bd7153deba2c
Signed-off-by: morgolock
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/3809
Comments-Addressed: Arm Jenkins
Tested-by: Arm Jenkins
Reviewed-by: Sang-Hoon Park
---
 src/runtime/IScheduler.cpp | 117 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 117 insertions(+)

(limited to 'src/runtime/IScheduler.cpp')

diff --git a/src/runtime/IScheduler.cpp b/src/runtime/IScheduler.cpp
index 6b961d7dfc..53df3699b0 100644
--- a/src/runtime/IScheduler.cpp
+++ b/src/runtime/IScheduler.cpp
@@ -23,8 +23,11 @@
  */
 #include "arm_compute/runtime/IScheduler.h"
 
+#include "arm_compute/core/CPP/ICPPKernel.h"
 #include "arm_compute/core/Error.h"
+#include "arm_compute/core/Window.h"
 #include "arm_compute/runtime/CPUUtils.h"
+#include "arm_compute/runtime/SchedulerUtils.h"
 
 namespace arm_compute
 {
@@ -51,6 +54,120 @@ unsigned int IScheduler::num_threads_hint() const
 {
     return _num_threads_hint;
 }
+
+void IScheduler::schedule_common(ICPPKernel *kernel, const Hints &hints, ITensorPack &tensors)
+{
+    ARM_COMPUTE_ERROR_ON_MSG(!kernel, "The child class didn't set the kernel");
+    ARM_COMPUTE_UNUSED(kernel);
+    ARM_COMPUTE_UNUSED(hints);
+    ARM_COMPUTE_UNUSED(tensors);
+#ifndef BARE_METAL
+    const Window &max_window = kernel->window();
+    if(hints.split_dimension() == IScheduler::split_dimensions_all)
+    {
+        /*
+         * if the split dim is size_t max then this signals we should parallelise over
+         * all dimensions
+         */
+        const std::size_t m = max_window.num_iterations(Window::DimX);
+        const std::size_t n = max_window.num_iterations(Window::DimY);
+
+        //in c++17 this can be swapped for   auto [ m_threads, n_threads ] = split_2d(...
+        unsigned m_threads, n_threads;
+        std::tie(m_threads, n_threads) = split_2d(this->num_threads(), m, n);
+
+        std::vector<IScheduler::Workload> workloads;
+        for(unsigned int ni = 0; ni != n_threads; ++ni)
+        {
+            for(unsigned int mi = 0; mi != m_threads; ++mi)
+            {
+                workloads.push_back(
+                    [ni, mi, m_threads, n_threads, &max_window, &kernel](const ThreadInfo & info)
+                {
+                    //narrow the window to our mi-ni workload
+                    Window win = max_window.split_window(Window::DimX, mi, m_threads)
+                                 .split_window(Window::DimY, ni, n_threads);
+
+                    win.validate();
+
+                    Window thread_locator;
+                    thread_locator.set(Window::DimX, Window::Dimension(mi, m_threads));
+                    thread_locator.set(Window::DimY, Window::Dimension(ni, n_threads));
+
+                    thread_locator.validate();
+
+                    kernel->run_nd(win, info, thread_locator);
+                });
+            }
+        }
+        run_workloads(workloads);
+    }
+    else
+    {
+        const unsigned int num_iterations = max_window.num_iterations(hints.split_dimension());
+        const unsigned int num_threads    = std::min(num_iterations, this->num_threads());
+
+        if(num_iterations == 0)
+        {
+            return;
+        }
+
+        if(!kernel->is_parallelisable() || num_threads == 1)
+        {
+            ThreadInfo info;
+            info.cpu_info = &_cpu_info;
+            if(tensors.empty())
+            {
+                kernel->run(max_window, info);
+            }
+            else
+            {
+                kernel->run_op(tensors, max_window, info);
+            }
+        }
+        else
+        {
+            unsigned int num_windows = 0;
+            switch(hints.strategy())
+            {
+                case StrategyHint::STATIC:
+                    num_windows = num_threads;
+                    break;
+                case StrategyHint::DYNAMIC:
+                {
+                    const unsigned int granule_threshold = (hints.threshold() <= 0) ? num_threads : static_cast<unsigned int>(hints.threshold());
+                    // Make sure we don't use some windows which are too small as this might create some contention on the ThreadFeeder
+                    num_windows = num_iterations > granule_threshold ? granule_threshold : num_iterations;
+                    break;
+                }
+                default:
+                    ARM_COMPUTE_ERROR("Unknown strategy");
+            }
+            std::vector<IScheduler::Workload> workloads(num_windows);
+            for(unsigned int t = 0; t < num_windows; ++t)
+            {
+                //Capture 't' by copy, all the other variables by reference:
+                workloads[t] = [t, &hints, &max_window, &num_windows, &kernel, &tensors](const ThreadInfo & info)
+                {
+                    Window win = max_window.split_window(hints.split_dimension(), t, num_windows);
+                    win.validate();
+
+                    if(tensors.empty())
+                    {
+                        kernel->run(win, info);
+                    }
+                    else
+                    {
+                        kernel->run_op(tensors, win, info);
+                    }
+                };
+            }
+            run_workloads(workloads);
+        }
+    }
+#endif /* !BARE_METAL */
+}
+
 void IScheduler::run_tagged_workloads(std::vector<Workload> &workloads, const char *tag)
 {
     ARM_COMPUTE_UNUSED(tag);
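Note (appended commentary, not part of the patch): the multi-dimension path relies on split_2d() from arm_compute/runtime/SchedulerUtils.h to factorise the thread count across the two window dimensions; that helper's implementation is not included in this file. The standalone sketch below only illustrates the idea with a naive stand-in factorisation and shows how each (mi, ni) pair maps onto one workload tile, mirroring the nested loops in schedule_common(). All names and numbers in the sketch (naive_split_2d, the thread count, the m/n sizes) are illustrative assumptions, not ComputeLibrary APIs, and the real split_2d may choose the factor pair differently.

// Standalone illustration (assumption) of the 2D split: pick a thread grid
// m_threads x n_threads for an m x n iteration space, then derive the
// sub-range each (mi, ni) workload would cover.
#include <cstddef>
#include <cstdio>
#include <tuple>
#include <utility>

// Assumed stand-in for split_2d(): choose a factor pair of `nthreads` whose
// per-thread tile is as close to square as possible.
static std::pair<unsigned int, unsigned int> naive_split_2d(unsigned int nthreads, std::size_t m, std::size_t n)
{
    unsigned int best_mt  = 1;
    unsigned int best_nt  = nthreads;
    double       best_err = 1e300;
    for(unsigned int mt = 1; mt <= nthreads; ++mt)
    {
        if(nthreads % mt != 0)
        {
            continue; // only exact factorisations, so every thread gets a tile
        }
        const unsigned int nt   = nthreads / mt;
        // Difference between tile height and tile width; smaller means squarer tiles.
        const double       diff = static_cast<double>(m) / mt - static_cast<double>(n) / nt;
        if(diff * diff < best_err)
        {
            best_err = diff * diff;
            best_mt  = mt;
            best_nt  = nt;
        }
    }
    return std::make_pair(best_mt, best_nt);
}

int main()
{
    const unsigned int nthreads = 8;   // stand-in for this->num_threads()
    const std::size_t  m        = 128; // iterations in Window::DimX
    const std::size_t  n        = 512; // iterations in Window::DimY

    unsigned int m_threads, n_threads;
    std::tie(m_threads, n_threads) = naive_split_2d(nthreads, m, n);
    std::printf("thread grid: %u x %u\n", m_threads, n_threads);

    // One workload per (mi, ni) pair, as in the nested loops of schedule_common().
    for(unsigned int ni = 0; ni != n_threads; ++ni)
    {
        for(unsigned int mi = 0; mi != m_threads; ++mi)
        {
            // Mirrors max_window.split_window(Window::DimX, mi, m_threads)
            //                   .split_window(Window::DimY, ni, n_threads)
            const std::size_t x0 = m * mi / m_threads;
            const std::size_t x1 = m * (mi + 1) / m_threads;
            const std::size_t y0 = n * ni / n_threads;
            const std::size_t y1 = n * (ni + 1) / n_threads;
            std::printf("workload(mi=%u, ni=%u) -> x:[%zu, %zu) y:[%zu, %zu)\n", mi, ni, x0, x1, y0, y1);
        }
    }
    return 0;
}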