From 5111264954e2d1a4d3e91d23a0869a0d7105be4c Mon Sep 17 00:00:00 2001 From: morgolock Date: Thu, 20 Aug 2020 14:51:39 +0100 Subject: COMPMID-3661: Added multidimension support to OMP scheduler. Change-Id: Iedacf7094896f08d7c2847c8fb99bd7153deba2c Signed-off-by: morgolock Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/3809 Comments-Addressed: Arm Jenkins Tested-by: Arm Jenkins Reviewed-by: Sang-Hoon Park --- Android.bp | 1 + arm_compute/runtime/CPP/CPPScheduler.h | 1 - arm_compute/runtime/IScheduler.h | 2 + arm_compute/runtime/SchedulerUtils.h | 39 ++++++++ src/runtime/CPP/CPPScheduler.cpp | 165 --------------------------------- src/runtime/IScheduler.cpp | 117 +++++++++++++++++++++++ src/runtime/OMP/OMPScheduler.cpp | 33 +------ src/runtime/SchedulerUtils.cpp | 79 ++++++++++++++++ 8 files changed, 240 insertions(+), 197 deletions(-) create mode 100644 arm_compute/runtime/SchedulerUtils.h create mode 100644 src/runtime/SchedulerUtils.cpp diff --git a/Android.bp b/Android.bp index d033d2d04e..5bf75500ea 100644 --- a/Android.bp +++ b/Android.bp @@ -751,6 +751,7 @@ cc_library_static { "src/runtime/RuntimeContext.cpp", "src/runtime/Scheduler.cpp", "src/runtime/SchedulerFactory.cpp", + "src/runtime/SchedulerUtils.cpp", "src/runtime/SubTensor.cpp", "src/runtime/Tensor.cpp", "src/runtime/TensorAllocator.cpp", diff --git a/arm_compute/runtime/CPP/CPPScheduler.h b/arm_compute/runtime/CPP/CPPScheduler.h index e8ad427eba..764af818d9 100644 --- a/arm_compute/runtime/CPP/CPPScheduler.h +++ b/arm_compute/runtime/CPP/CPPScheduler.h @@ -62,7 +62,6 @@ protected: void run_workloads(std::vector &workloads) override; private: - void schedule_common(ICPPKernel *kernel, const Hints &hints, ITensorPack &tensors); struct Impl; std::unique_ptr _impl; }; diff --git a/arm_compute/runtime/IScheduler.h b/arm_compute/runtime/IScheduler.h index 98627538e8..309aee3bb5 100644 --- a/arm_compute/runtime/IScheduler.h +++ b/arm_compute/runtime/IScheduler.h @@ -205,6 +205,8 @@ protected: virtual void run_workloads(std::vector &workloads) = 0; CPUInfo _cpu_info; + void schedule_common(ICPPKernel *kernel, const Hints &hints, ITensorPack &tensors); + private: unsigned int _num_threads_hint = {}; }; diff --git a/arm_compute/runtime/SchedulerUtils.h b/arm_compute/runtime/SchedulerUtils.h new file mode 100644 index 0000000000..5a88e039cc --- /dev/null +++ b/arm_compute/runtime/SchedulerUtils.h @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_SCHEDULER_UTILS_H +#define ARM_COMPUTE_SCHEDULER_UTILS_H + +namespace arm_compute +{ +/** Given two dimensions and a maxium number of threads to utilise, calculate the best + * combination of threads that fit in (mutliplied together) max_threads. + * + * This algorithm assumes that work in either of the dimensions is equally difficult + * to compute + * + * @returns [m_nthreads, n_nthreads] A pair of the threads that should be used in each dimension + */ +std::pair split_2d(unsigned max_threads, std::size_t m, std::size_t n); +} // namespace arm_compute +#endif /* ARM_COMPUTE_SCHEDULER_UTILS_H */ diff --git a/src/runtime/CPP/CPPScheduler.cpp b/src/runtime/CPP/CPPScheduler.cpp index 55f62c1387..f017006de7 100644 --- a/src/runtime/CPP/CPPScheduler.cpp +++ b/src/runtime/CPP/CPPScheduler.cpp @@ -71,61 +71,6 @@ private: const unsigned int _end; }; -/** Given two dimensions and a maxium number of threads to utilise, calcualte the best - * combination of threads that fit in (mutliplied together) max_threads. - * - * This algorithm assumes that work in either of the dimensions is equally difficult - * to compute - * - * @returns [m_nthreads, n_nthreads] A pair of the threads that should be used in each dimension - */ -std::pair split_2d(unsigned max_threads, std::size_t m, std::size_t n) -{ - /* - * We want the same ratio of threads in M & N to the ratio of m and n problem size - * - * Therefore: mt/nt == m/n where mt*nt == max_threads - * - * max_threads/nt = mt & (max_threads/nt) * (m/n) = nt - * nt^2 = max_threads * (m/n) - * nt = sqrt( max_threads * (m/n) ) - */ - //ratio of m to n in problem dimensions - double ratio = m / static_cast(n); - - // nt = sqrt(max_threads * (m / n) ) - const unsigned adjusted = std::round( - std::sqrt(max_threads * ratio)); - - //find the nearest factor of max_threads - for(unsigned i = 0; i != adjusted; ++i) - { - //try down - const unsigned adj_down = adjusted - i; - if(max_threads % adj_down == 0) - { - return { adj_down, max_threads / adj_down }; - } - - //try up - const unsigned adj_up = adjusted + i; - if(max_threads % adj_up == 0) - { - return { adj_up, max_threads / adj_up }; - } - } - - //we didn't find anything so lets bail out with maxes biased to the largest dimension - if(m > n) - { - return { std::min(m, max_threads), 1 }; - } - else - { - return { 1, std::min(n, max_threads) }; - } -} - /** Execute workloads[info.thread_id] first, then call the feeder to get the index of the next workload to run. * * Will run workloads until the feeder reaches the end of its range. 
@@ -405,116 +350,6 @@ void CPPScheduler::run_workloads(std::vector &workloads) } #endif /* DOXYGEN_SKIP_THIS */ -void CPPScheduler::schedule_common(ICPPKernel *kernel, const Hints &hints, ITensorPack &tensors) -{ - ARM_COMPUTE_ERROR_ON_MSG(!kernel, "The child class didn't set the kernel"); - - const Window &max_window = kernel->window(); - - if(hints.split_dimension() == IScheduler::split_dimensions_all) - { - /* - * if the split dim is size_t max then this signals we should parallelise over - * all dimensions - */ - const std::size_t m = max_window.num_iterations(Window::DimX); - const std::size_t n = max_window.num_iterations(Window::DimY); - - //in c++17 this can be swapped for auto [ m_threads, n_threads ] = split_2d(... - unsigned m_threads, n_threads; - std::tie(m_threads, n_threads) = split_2d(_impl->_num_threads, m, n); - - std::vector workloads; - for(unsigned int ni = 0; ni != n_threads; ++ni) - { - for(unsigned int mi = 0; mi != m_threads; ++mi) - { - workloads.push_back( - [ni, mi, m_threads, n_threads, &max_window, &kernel](const ThreadInfo & info) - { - //narrow the window to our mi-ni workload - Window win = max_window.split_window(Window::DimX, mi, m_threads) - .split_window(Window::DimY, ni, n_threads); - - win.validate(); - - Window thread_locator; - thread_locator.set(Window::DimX, Window::Dimension(mi, m_threads)); - thread_locator.set(Window::DimY, Window::Dimension(ni, n_threads)); - - thread_locator.validate(); - - kernel->run_nd(win, info, thread_locator); - }); - } - } - run_workloads(workloads); - } - else - { - const unsigned int num_iterations = max_window.num_iterations(hints.split_dimension()); - const unsigned int num_threads = std::min(num_iterations, _impl->_num_threads); - - if(num_iterations == 0) - { - return; - } - - if(!kernel->is_parallelisable() || num_threads == 1) - { - ThreadInfo info; - info.cpu_info = &_cpu_info; - if(tensors.empty()) - { - kernel->run(max_window, info); - } - else - { - kernel->run_op(tensors, max_window, info); - } - } - else - { - unsigned int num_windows = 0; - switch(hints.strategy()) - { - case StrategyHint::STATIC: - num_windows = num_threads; - break; - case StrategyHint::DYNAMIC: - { - const unsigned int granule_threshold = (hints.threshold() <= 0) ? num_threads : static_cast(hints.threshold()); - // Make sure we don't use some windows which are too small as this might create some contention on the ThreadFeeder - num_windows = num_iterations > granule_threshold ? 
granule_threshold : num_iterations; - break; - } - default: - ARM_COMPUTE_ERROR("Unknown strategy"); - } - std::vector workloads(num_windows); - for(unsigned int t = 0; t < num_windows; t++) - { - //Capture 't' by copy, all the other variables by reference: - workloads[t] = [t, &hints, &max_window, &num_windows, &kernel, &tensors](const ThreadInfo & info) - { - Window win = max_window.split_window(hints.split_dimension(), t, num_windows); - win.validate(); - - if(tensors.empty()) - { - kernel->run(win, info); - } - else - { - kernel->run_op(tensors, win, info); - } - }; - } - run_workloads(workloads); - } - } -} - void CPPScheduler::schedule_op(ICPPKernel *kernel, const Hints &hints, ITensorPack &tensors) { schedule_common(kernel, hints, tensors); diff --git a/src/runtime/IScheduler.cpp b/src/runtime/IScheduler.cpp index 6b961d7dfc..53df3699b0 100644 --- a/src/runtime/IScheduler.cpp +++ b/src/runtime/IScheduler.cpp @@ -23,8 +23,11 @@ */ #include "arm_compute/runtime/IScheduler.h" +#include "arm_compute/core/CPP/ICPPKernel.h" #include "arm_compute/core/Error.h" +#include "arm_compute/core/Window.h" #include "arm_compute/runtime/CPUUtils.h" +#include "arm_compute/runtime/SchedulerUtils.h" namespace arm_compute { @@ -51,6 +54,120 @@ unsigned int IScheduler::num_threads_hint() const { return _num_threads_hint; } + +void IScheduler::schedule_common(ICPPKernel *kernel, const Hints &hints, ITensorPack &tensors) +{ + ARM_COMPUTE_ERROR_ON_MSG(!kernel, "The child class didn't set the kernel"); + ARM_COMPUTE_UNUSED(kernel); + ARM_COMPUTE_UNUSED(hints); + ARM_COMPUTE_UNUSED(tensors); +#ifndef BARE_METAL + const Window &max_window = kernel->window(); + if(hints.split_dimension() == IScheduler::split_dimensions_all) + { + /* + * if the split dim is size_t max then this signals we should parallelise over + * all dimensions + */ + const std::size_t m = max_window.num_iterations(Window::DimX); + const std::size_t n = max_window.num_iterations(Window::DimY); + + //in c++17 this can be swapped for auto [ m_threads, n_threads ] = split_2d(... 
+ unsigned m_threads, n_threads; + std::tie(m_threads, n_threads) = split_2d(this->num_threads(), m, n); + + std::vector workloads; + for(unsigned int ni = 0; ni != n_threads; ++ni) + { + for(unsigned int mi = 0; mi != m_threads; ++mi) + { + workloads.push_back( + [ni, mi, m_threads, n_threads, &max_window, &kernel](const ThreadInfo & info) + { + //narrow the window to our mi-ni workload + Window win = max_window.split_window(Window::DimX, mi, m_threads) + .split_window(Window::DimY, ni, n_threads); + + win.validate(); + + Window thread_locator; + thread_locator.set(Window::DimX, Window::Dimension(mi, m_threads)); + thread_locator.set(Window::DimY, Window::Dimension(ni, n_threads)); + + thread_locator.validate(); + + kernel->run_nd(win, info, thread_locator); + }); + } + } + run_workloads(workloads); + } + else + { + const unsigned int num_iterations = max_window.num_iterations(hints.split_dimension()); + const unsigned int num_threads = std::min(num_iterations, this->num_threads()); + + if(num_iterations == 0) + { + return; + } + + if(!kernel->is_parallelisable() || num_threads == 1) + { + ThreadInfo info; + info.cpu_info = &_cpu_info; + if(tensors.empty()) + { + kernel->run(max_window, info); + } + else + { + kernel->run_op(tensors, max_window, info); + } + } + else + { + unsigned int num_windows = 0; + switch(hints.strategy()) + { + case StrategyHint::STATIC: + num_windows = num_threads; + break; + case StrategyHint::DYNAMIC: + { + const unsigned int granule_threshold = (hints.threshold() <= 0) ? num_threads : static_cast(hints.threshold()); + // Make sure we don't use some windows which are too small as this might create some contention on the ThreadFeeder + num_windows = num_iterations > granule_threshold ? granule_threshold : num_iterations; + break; + } + default: + ARM_COMPUTE_ERROR("Unknown strategy"); + } + std::vector workloads(num_windows); + for(unsigned int t = 0; t < num_windows; ++t) + { + //Capture 't' by copy, all the other variables by reference: + workloads[t] = [t, &hints, &max_window, &num_windows, &kernel, &tensors](const ThreadInfo & info) + { + Window win = max_window.split_window(hints.split_dimension(), t, num_windows); + win.validate(); + + if(tensors.empty()) + { + kernel->run(win, info); + } + else + { + kernel->run_op(tensors, win, info); + } + }; + } + run_workloads(workloads); + } + } +#endif /* !BARE_METAL */ +} + void IScheduler::run_tagged_workloads(std::vector &workloads, const char *tag) { ARM_COMPUTE_UNUSED(tag); diff --git a/src/runtime/OMP/OMPScheduler.cpp b/src/runtime/OMP/OMPScheduler.cpp index 11448e595c..4c2f03a53a 100644 --- a/src/runtime/OMP/OMPScheduler.cpp +++ b/src/runtime/OMP/OMPScheduler.cpp @@ -28,7 +28,6 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/Utils.h" #include "arm_compute/runtime/CPUUtils.h" - #include namespace arm_compute @@ -51,36 +50,8 @@ void OMPScheduler::set_num_threads(unsigned int num_threads) void OMPScheduler::schedule(ICPPKernel *kernel, const Hints &hints) { - ARM_COMPUTE_ERROR_ON_MSG(!kernel, "The child class didn't set the kernel"); - ARM_COMPUTE_ERROR_ON_MSG(hints.strategy() == StrategyHint::DYNAMIC, - "Dynamic scheduling is not supported in OMPScheduler"); - - const Window &max_window = kernel->window(); - const unsigned int num_iterations = max_window.num_iterations(hints.split_dimension()); - const unsigned int num_threads = std::min(num_iterations, _num_threads); - - if(!kernel->is_parallelisable() || num_threads == 1) - { - ThreadInfo info; - info.cpu_info = &_cpu_info; - 
kernel->run(max_window, info); - } - else - { - const unsigned int num_windows = num_threads; - std::vector workloads(num_windows); - for(unsigned int t = 0; t < num_windows; t++) - { - //Capture 't' by copy, all the other variables by reference: - workloads[t] = [t, &hints, &max_window, &num_windows, &kernel](const ThreadInfo & info) - { - Window win = max_window.split_window(hints.split_dimension(), t, num_windows); - win.validate(); - kernel->run(win, info); - }; - } - run_workloads(workloads); - } + ITensorPack tensors; + schedule_common(kernel, hints, tensors); } void OMPScheduler::schedule_op(ICPPKernel *kernel, const Hints &hints, ITensorPack &tensors) diff --git a/src/runtime/SchedulerUtils.cpp b/src/runtime/SchedulerUtils.cpp new file mode 100644 index 0000000000..1c12e3ce58 --- /dev/null +++ b/src/runtime/SchedulerUtils.cpp @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_compute/core/Error.h" + +#include + +namespace arm_compute +{ +#ifndef BARE_METAL +std::pair split_2d(unsigned max_threads, std::size_t m, std::size_t n) +{ + /* + * We want the same ratio of threads in M & N to the ratio of m and n problem size + * + * Therefore: mt/nt == m/n where mt*nt == max_threads + * + * max_threads/nt = mt & (max_threads/nt) * (m/n) = nt + * nt^2 = max_threads * (m/n) + * nt = sqrt( max_threads * (m/n) ) + */ + //ratio of m to n in problem dimensions + double ratio = m / static_cast(n); + + // nt = sqrt(max_threads * (m / n) ) + const unsigned adjusted = std::round( + std::sqrt(max_threads * ratio)); + + //find the nearest factor of max_threads + for(unsigned i = 0; i != adjusted; ++i) + { + //try down + const unsigned adj_down = adjusted - i; + if(max_threads % adj_down == 0) + { + return { adj_down, max_threads / adj_down }; + } + + //try up + const unsigned adj_up = adjusted + i; + if(max_threads % adj_up == 0) + { + return { adj_up, max_threads / adj_up }; + } + } + + //we didn't find anything so lets bail out with maxes biased to the largest dimension + if(m > n) + { + return { std::min(m, max_threads), 1 }; + } + else + { + return { 1, std::min(n, max_threads) }; + } +} +#endif /* #ifndef BARE_METAL */ +} // namespace arm_compute -- cgit v1.2.1
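
For reference, the core technique this patch factors out into SchedulerUtils is the 2D thread split: pick a factorisation of max_threads whose ratio tracks the problem's m/n ratio, then carve the iteration space into one tile per (mi, ni) pair. Below is a minimal standalone sketch of that heuristic, not the Compute Library code itself; split_2d_sketch and the demo main() are illustrative names written for this example only.

// Standalone sketch of the 2D split heuristic moved into SchedulerUtils by this patch.
// Illustrative only; it is not the library implementation.
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <iostream>
#include <utility>

// Pick (m_threads, n_threads) so that m_threads * n_threads == max_threads and
// m_threads / n_threads approximates the problem ratio m / n.
std::pair<unsigned, unsigned> split_2d_sketch(unsigned max_threads, std::size_t m, std::size_t n)
{
    const double ratio = static_cast<double>(m) / static_cast<double>(n);
    // From mt/nt == m/n and mt*nt == max_threads:  mt = sqrt(max_threads * m/n)
    const unsigned target = static_cast<unsigned>(std::round(std::sqrt(max_threads * ratio)));

    // Search outwards from the target for the nearest factor of max_threads.
    for(unsigned i = 0; i <= target; ++i)
    {
        const unsigned down = target - i;
        if(down != 0 && max_threads % down == 0)
        {
            return { down, max_threads / down };
        }
        const unsigned up = target + i;
        if(up != 0 && max_threads % up == 0)
        {
            return { up, max_threads / up };
        }
    }
    // Fallback: give all threads to the larger dimension.
    return (m > n) ? std::make_pair(std::min(static_cast<unsigned>(m), max_threads), 1u)
                   : std::make_pair(1u, std::min(static_cast<unsigned>(n), max_threads));
}

int main()
{
    // Example: 8 threads on a 1024 x 256 problem.
    const auto grid = split_2d_sketch(8, 1024, 256);
    std::cout << grid.first << " x " << grid.second << " thread grid\n";

    // Each (mi, ni) pair then receives one slice of the window, mirroring the
    // nested loops that build workloads in IScheduler::schedule_common.
    for(unsigned ni = 0; ni < grid.second; ++ni)
    {
        for(unsigned mi = 0; mi < grid.first; ++mi)
        {
            std::cout << "workload for tile (" << mi << ", " << ni << ")\n";
        }
    }
    return 0;
}

With 8 threads on a 1024 x 256 problem the sketch settles on a 4 x 2 thread grid, which matches what the split_2d in this patch returns for the same inputs; the scheduler then narrows the kernel window to one (mi, ni) tile per workload and dispatches them through run_workloads.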