diff options
Diffstat (limited to 'src/runtime/OMP')
-rw-r--r-- | src/runtime/OMP/OMPScheduler.cpp | 57 |
1 files changed, 42 insertions, 15 deletions
diff --git a/src/runtime/OMP/OMPScheduler.cpp b/src/runtime/OMP/OMPScheduler.cpp index a8bd5a0d60..2a5abb5f7a 100644 --- a/src/runtime/OMP/OMPScheduler.cpp +++ b/src/runtime/OMP/OMPScheduler.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2021 Arm Limited. + * Copyright (c) 2017-2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -27,15 +27,29 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/Utils.h" -#include "src/runtime/CPUUtils.h" + #include <omp.h> namespace arm_compute { +#if !defined(_WIN64) && !defined(BARE_METAL) && !defined(__APPLE__) && !defined(__OpenBSD__) && \ + (defined(__arm__) || defined(__aarch64__)) && defined(__ANDROID__) +OMPScheduler::OMPScheduler() // NOLINT + : _num_threads(cpu_info().get_cpu_num_excluding_little()), + _has_lmb(cpu_info().cpu_has_little_mid_big()), + _nonlittle_num_cpus(cpu_info().get_cpu_num_excluding_little()) +{ +} +#else /* !defined(_WIN64) && !defined(BARE_METAL) && !defined(__APPLE__) && !defined(__OpenBSD__) && \ + (defined(__arm__) || defined(__aarch64__)) && defined(__ANDROID__)*/ OMPScheduler::OMPScheduler() // NOLINT - : _num_threads(omp_get_max_threads()) + : _num_threads(omp_get_max_threads()), + _has_lmb(cpu_info().cpu_has_little_mid_big()), + _nonlittle_num_cpus(cpu_info().get_cpu_num_excluding_little()) { } +#endif /* !defined(_WIN64) && !defined(BARE_METAL) && !defined(__APPLE__) && !defined(__OpenBSD__) && \ + (defined(__arm__) || defined(__aarch64__)) && defined(__ANDROID__)*/ unsigned int OMPScheduler::num_threads() const { @@ -45,7 +59,15 @@ unsigned int OMPScheduler::num_threads() const void OMPScheduler::set_num_threads(unsigned int num_threads) { const unsigned int num_cores = omp_get_max_threads(); - _num_threads = (num_threads == 0) ? num_cores : num_threads; +#if !defined(_WIN64) && !defined(BARE_METAL) && !defined(__APPLE__) && !defined(__OpenBSD__) && \ + (defined(__arm__) || defined(__aarch64__)) && defined(__ANDROID__) + const unsigned int adjusted_num_threads = (_has_lmb) ? _nonlittle_num_cpus : num_threads; + _num_threads = (num_threads == 0) ? num_cores : adjusted_num_threads; +#else /* !defined(_WIN64) && !defined(BARE_METAL) && !defined(__APPLE__) && !defined(__OpenBSD__) && \ + (defined(__arm__) || defined(__aarch64__)) && defined(__ANDROID__)*/ + _num_threads = (num_threads == 0) ? num_cores : num_threads; +#endif /* !defined(_WIN64) && !defined(BARE_METAL) && !defined(__APPLE__) && !defined(__OpenBSD__) && \ + (defined(__arm__) || defined(__aarch64__)) && defined(__ANDROID__)*/ } void OMPScheduler::schedule(ICPPKernel *kernel, const Hints &hints) @@ -64,20 +86,20 @@ void OMPScheduler::schedule_op(ICPPKernel *kernel, const Hints &hints, const Win const unsigned int num_iterations = max_window.num_iterations(hints.split_dimension()); const unsigned int num_threads = std::min(num_iterations, _num_threads); - if(!kernel->is_parallelisable() || num_threads == 1) + if (!kernel->is_parallelisable() || num_threads == 1) { ThreadInfo info; - info.cpu_info = &_cpu_info; + info.cpu_info = &cpu_info(); kernel->run_op(tensors, max_window, info); } else { const unsigned int num_windows = num_threads; std::vector<IScheduler::Workload> workloads(num_windows); - for(unsigned int t = 0; t < num_windows; t++) + for (unsigned int t = 0; t < num_windows; t++) { //Capture 't' by copy, all the other variables by reference: - workloads[t] = [t, &hints, &max_window, &num_windows, &kernel, &tensors](const ThreadInfo & info) + workloads[t] = [t, &hints, &max_window, &num_windows, &kernel, &tensors](const ThreadInfo &info) { Window win = max_window.split_window(hints.split_dimension(), t, num_windows); win.validate(); @@ -90,20 +112,25 @@ void OMPScheduler::schedule_op(ICPPKernel *kernel, const Hints &hints, const Win #ifndef DOXYGEN_SKIP_THIS void OMPScheduler::run_workloads(std::vector<arm_compute::IScheduler::Workload> &workloads) { - const unsigned int num_threads = std::min(_num_threads, static_cast<unsigned int>(workloads.size())); - if(num_threads < 1) + const unsigned int amount_of_work = static_cast<unsigned int>(workloads.size()); + const unsigned int num_threads_to_use = std::min(_num_threads, amount_of_work); + + if (num_threads_to_use < 1) { return; } ThreadInfo info; - info.cpu_info = &_cpu_info; - info.num_threads = num_threads; - #pragma omp parallel firstprivate(info) num_threads(num_threads) + info.cpu_info = &cpu_info(); + info.num_threads = num_threads_to_use; +#pragma omp parallel for firstprivate(info) num_threads(num_threads_to_use) default(shared) proc_bind(close) \ + schedule(static, 1) + for (unsigned int wid = 0; wid < amount_of_work; ++wid) { - const int tid = omp_get_thread_num(); + const int tid = omp_get_thread_num(); + info.thread_id = tid; - workloads[tid](info); + workloads[wid](info); } } #endif /* DOXYGEN_SKIP_THIS */ |