From ded5b182675e3166e947a8eb637b5b1e925816ab Mon Sep 17 00:00:00 2001 From: David Svantesson Date: Wed, 2 Aug 2023 14:23:00 +0000 Subject: thread_local _custom_scheduler Resolves ONCPUML-1331 This patch adds an option to make _custom_scheduler thread_local to support usage of multiple schedulers handled outside of ACL. It also adds num_threads() function to Scheduler which reverts to querying CPUInfo if no scheduler has been set. Change-Id: Iff706165d8d091895331a5bb3a76f6cabe048912 Signed-off-by: David Svantesson-Yeung Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/10748 Comments-Addressed: Arm Jenkins Reviewed-by: SiCong Li Tested-by: Arm Jenkins Benchmark: Arm Jenkins --- .bazelrc | 1 + BUILD.bazel | 16 +++++++++++ arm_compute/runtime/Scheduler.h | 26 ++++++++++++++---- cmake/Options.cmake | 6 +++- .../CpuDepthwiseConv2dAssemblyDispatch.cpp | 4 +-- src/cpu/operators/CpuPool2d.cpp | 4 +-- src/cpu/operators/CpuWinogradConv2d.cpp | 6 ++-- .../operators/internal/CpuGemmAssemblyDispatch.cpp | 24 ++++++++-------- src/runtime/Scheduler.cpp | 32 ++++++++++++++++++++-- 9 files changed, 90 insertions(+), 29 deletions(-) diff --git a/.bazelrc b/.bazelrc index 1dbbedc8bd..f74649d731 100644 --- a/.bazelrc +++ b/.bazelrc @@ -39,3 +39,4 @@ build --flag_alias=cppthreads=//:cppthreads build --flag_alias=enable_bf16_validation=//:enable_bf16_validation build --flag_alias=enable_sve_validation=//:enable_sve_validation build --flag_alias=arch=//:arch +build --flag_alias=thread_local_scheduler=//:thread_local_scheduler diff --git a/BUILD.bazel b/BUILD.bazel index 3a7d941a0e..50340c6c39 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -78,6 +78,12 @@ bool_flag( visibility = ["//visibility:public"], ) +bool_flag( + name = "thread_local_scheduler", + build_setting_default = False, + visibility = ["//visibility:public"], +) + string_flag( name = "arch", build_setting_default = "armv8-a", @@ -152,6 +158,12 @@ config_setting( } ) +config_setting( + name = 
"thread_local_scheduler_flag", + flag_values = { + ":thread_local_scheduler": "true", + }, +) #--------------------------------------------------------------------- # Common defines used for all targets @@ -196,6 +208,10 @@ cc_library( "//:arch_armv8-a": [], "//:arch_armv8.2-a+fp16": ["ENABLE_FP16_KERNELS", "ARM_COMPUTE_ENABLE_FP16"], "//conditions:default": [], + }) + + select({ + "//:thread_local_scheduler_flag": ["ARM_COMPUTE_THREAD_LOCAL_SCHEDULER"], + "//conditions:default": [], }), visibility = ["//visibility:public"], ) diff --git a/arm_compute/runtime/Scheduler.h b/arm_compute/runtime/Scheduler.h index bd29cbb31f..481e5e9b60 100644 --- a/arm_compute/runtime/Scheduler.h +++ b/arm_compute/runtime/Scheduler.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Arm Limited. +* Copyright (c) 2017-2019, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_SCHEDULER_H -#define ARM_COMPUTE_SCHEDULER_H +#ifndef ACL_ARM_COMPUTE_RUNTIME_SCHEDULER_H +#define ACL_ARM_COMPUTE_RUNTIME_SCHEDULER_H #include "arm_compute/runtime/IScheduler.h" @@ -72,13 +72,27 @@ public: * @return true if the given scheduler type is supported. False otherwise. */ static bool is_available(Type t); + /** Returns true if a scheduler has been set. + * + * @return true if a scheduler has been set. False otherwise. + */ + static bool is_set(); + /** Returns number of threads from scheduler if scheduler is set, otherwise queries CPUInfo. + * + * @return number of threads from scheduler if scheduler is set, otherwise queries CPUInfo. 
+ */ + static unsigned int num_threads(); private: - static Type _scheduler_type; - static std::shared_ptr<IScheduler> _custom_scheduler; + static Type _scheduler_type; +#ifndef ARM_COMPUTE_THREAD_LOCAL_SCHEDULER + static std::shared_ptr<IScheduler> _custom_scheduler; +#else // ARM_COMPUTE_THREAD_LOCAL_SCHEDULER + static std::shared_ptr<IScheduler> thread_local _custom_scheduler; +#endif // ARM_COMPUTE_THREAD_LOCAL_SCHEDULER static std::map<Type, std::unique_ptr<IScheduler>> _schedulers; Scheduler(); }; } // namespace arm_compute -#endif /* ARM_COMPUTE_SCHEDULER_H */ +#endif // ACL_ARM_COMPUTE_RUNTIME_SCHEDULER_H diff --git a/cmake/Options.cmake b/cmake/Options.cmake index bc51cbbc0d..722c55c90a 100644 --- a/cmake/Options.cmake +++ b/cmake/Options.cmake @@ -65,6 +65,7 @@ option(ENABLE_NCHW_KERNELS "" ON) option(ARM_COMPUTE_GRAPH_ENABLED "" ON) option(ARM_COMPUTE_ENABLE_SVEF32MM "" ON) option(ARM_COMPUTE_ENABLE_FIXED_FORMAT_KERNELS "" ON) +option(ARM_COMPUTE_THREAD_LOCAL_SCHEDULER "" OFF) option(ENABLE_FP16_KERNELS "" OFF) option(ARM_COMPUTE_ENABLE_FP16 "" OFF) @@ -116,4 +117,7 @@ endif() if(ARM_COMPUTE_ENABLE_FIXED_FORMAT_KERNELS) add_definitions(-DARM_COMPUTE_ENABLE_FIXED_FORMAT_KERNELS) endif() -add_definitions(-D_GLIBCXX_USE_NANOSLEEP) \ No newline at end of file +if(ARM_COMPUTE_THREAD_LOCAL_SCHEDULER) + add_definitions(-DARM_COMPUTE_THREAD_LOCAL_SCHEDULER) +endif() +add_definitions(-D_GLIBCXX_USE_NANOSLEEP) diff --git a/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp b/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp index 8d3741de96..8507c59e6b 100644 --- a/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp +++ b/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp @@ -60,8 +60,8 @@ void CpuDepthwiseConv2dAssemblyDispatch::configure(const ITensorInfo *src, const ConvolutionInfo &info) { ARM_COMPUTE_LOG_PARAMS(src, weights, bias, dst, info); - const CPUInfo &ci = NEScheduler::get().cpu_info(); - const unsigned int num_threads = NEScheduler::get().num_threads(); + const CPUInfo &ci = CPUInfo::get(); +
const unsigned int num_threads = NEScheduler::num_threads(); _pImpl->is_prepared = false; _pImpl->are_weights_const = weights->are_values_constant(); diff --git a/src/cpu/operators/CpuPool2d.cpp b/src/cpu/operators/CpuPool2d.cpp index b72bde6978..d00efd191d 100644 --- a/src/cpu/operators/CpuPool2d.cpp +++ b/src/cpu/operators/CpuPool2d.cpp @@ -69,8 +69,8 @@ void CpuPool2d::configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLayer if (run_optimised) { - const CPUInfo &ci = NEScheduler::get().cpu_info(); - const unsigned int num_threads = NEScheduler::get().num_threads(); + const CPUInfo &ci = CPUInfo::get(); + const unsigned int num_threads = NEScheduler::num_threads(); auto pooling_wrapper = std::make_unique<kernels::CpuPool2dAssemblyWrapperKernel>(); ARM_COMPUTE_ERROR_ON(pooling_wrapper == nullptr); diff --git a/src/cpu/operators/CpuWinogradConv2d.cpp b/src/cpu/operators/CpuWinogradConv2d.cpp index e4bcdc0b64..1fb6d33a61 100644 --- a/src/cpu/operators/CpuWinogradConv2d.cpp +++ b/src/cpu/operators/CpuWinogradConv2d.cpp @@ -103,7 +103,7 @@ bool get_winograd_kernel_implementation(const ITensorInfo Tensor4DShape in_shape{internal_get_shape(src)}; Tensor4DShape out_shape{internal_get_shape(dst)}; Tensor4DShape kernel_shape{internal_get_shape(weights)}; - uint32_t nthreads = NEScheduler::get().num_threads(); + uint32_t nthreads = NEScheduler::num_threads(); // Get configuration arguments for Winograd winograd_cfg.output_rows = 0; winograd_cfg.output_cols = 0; @@ -183,7 +183,7 @@ void CpuWinogradConv2d::configure(const ITensorInfo *src, ARM_COMPUTE_LOG_PARAMS(src, weights, biases, dst, conv_info, act_info, enable_fast_math); ARM_COMPUTE_UNUSED(biases); const DataType data_type = src->data_type(); - uint32_t nthreads = NEScheduler::get().num_threads(); + uint32_t nthreads = NEScheduler::num_threads(); _data_layout = src->data_layout(); const Tensor4DShape kernel_shape{internal_get_shape(weights)}; @@ -361,7 +361,7 @@ void CpuWinogradConv2d::run(ITensorPack &tensors) auto output =
tensors.get_tensor(ACL_DST); Window win; - const uint32_t nthreads = NEScheduler::get().num_threads(); + const uint32_t nthreads = NEScheduler::num_threads(); // The Winograd transform implementation does fine-grain threading inside the transforms. Just pass thread_id and nthreads. win.set(Window::DimX, Window::Dimension(0, nthreads, 1)); diff --git a/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp b/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp index 611bc76463..7f851aa755 100644 --- a/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp +++ b/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp @@ -579,9 +579,8 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::prepare(ITensorPack &tensors) CpuAuxTensorHandler pretranspose(offset_int_vec(Pretranspose), _pretranspose_info, tensors, false); ARM_COMPUTE_ERROR_ON(pretranspose.get()->buffer() == nullptr); - run_parallel_pretranspose_B_array<TypeInput, TypeOutput>(_gemm_kernel_asm.get(), pretranspose.get(), - in1_ptr, ldb, multi_stride_b, - NEScheduler::get().num_threads()); + run_parallel_pretranspose_B_array<TypeInput, TypeOutput>( + _gemm_kernel_asm.get(), pretranspose.get(), in1_ptr, ldb, multi_stride_b, NEScheduler::num_threads()); b->mark_as_unused(); // Note that we don't need to mark b_to_use as unused, as if it's been assigned to pre_pretransposed_b, its memory will be auto-managed by the handler @@ -691,9 +690,8 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::run(ITensorPack &tensors) } else { - run_parallel_pretranspose_B_array<TypeInput, TypeOutput>(_gemm_kernel_asm.get(), pretranspose.get(), - b_ptr, ldb, multi_stride_b, - NEScheduler::get().num_threads()); + run_parallel_pretranspose_B_array<TypeInput, TypeOutput>( + _gemm_kernel_asm.get(), pretranspose.get(), b_ptr, ldb, multi_stride_b, NEScheduler::num_threads()); } } } @@ -707,7 +705,7 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::run(ITensorPack &tensors) _gemm_kernel_asm->set_working_space(reinterpret_cast<void *>(workspace.get()->buffer())); const unsigned int split_dim = scheduling_hint.split_dimension(); const unsigned int window_size = _gemm_kernel_asm->get_window_size().total_size(); - unsigned int num_threads =
NEScheduler::get().num_threads(); + unsigned int num_threads = NEScheduler::num_threads(); if (window_size < num_threads) { num_threads = window_size; @@ -756,8 +754,8 @@ void create_arm_gemm(std::unique_ptr<CpuGemmAssemblyDispatch::IFallback> &arm_ge const AsmGemmInfo &info) { Params p = extract_parameters(a, b, d, info); - const CPUInfo &ci = NEScheduler::get().cpu_info(); - unsigned int num_threads = NEScheduler::get().num_threads(); + const CPUInfo &ci = CPUInfo::get(); + unsigned int num_threads = NEScheduler::num_threads(); arm_gemm::GemmConfig cfg; cfg.weight_format = assembly_utils::map_to_arm_gemm_weight_format(info.weight_format); @@ -781,8 +779,8 @@ void create_arm_gemm_quant(std::unique_ptr<CpuGemmAssemblyDispatch::IFallback> & { ARM_COMPUTE_UNUSED(activation); Params p = extract_parameters(a, b, d, info); - const CPUInfo &ci = NEScheduler::get().cpu_info(); - const unsigned int num_threads = NEScheduler::get().num_threads(); + const CPUInfo &ci = CPUInfo::get(); + const unsigned int num_threads = NEScheduler::num_threads(); arm_gemm::GemmConfig cfg; cfg.weight_format = assembly_utils::map_to_arm_gemm_weight_format(info.weight_format); @@ -836,8 +834,8 @@ Status CpuGemmAssemblyDispatch::has_opt_impl(arm_compute::WeightFormat &expected ARM_COMPUTE_UNUSED(c); arm_gemm::Activation act = assembly_utils::map_to_arm_gemm_activation(info.activation_info); Params p = extract_parameters(a, b, d, info); - const CPUInfo &ci = NEScheduler::get().cpu_info(); - unsigned int num_threads = NEScheduler::get().num_threads(); + const CPUInfo &ci = CPUInfo::get(); + unsigned int num_threads = NEScheduler::num_threads(); arm_gemm::GemmConfig cfg; cfg.weight_format = assembly_utils::map_to_arm_gemm_weight_format(info.weight_format); arm_gemm::WeightFormat arm_gemm_expected_wf = assembly_utils::map_to_arm_gemm_weight_format(expected_weight_format); diff --git a/src/runtime/Scheduler.cpp b/src/runtime/Scheduler.cpp index e52fb59940..6d961f29a5 100644 --- a/src/runtime/Scheduler.cpp +++ b/src/runtime/Scheduler.cpp @@ -1,5 +1,5 @@ /* - * Copyright
(c) 2017-2020 Arm Limited. +* Copyright (c) 2017-2020, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -40,14 +40,18 @@ using namespace arm_compute; #if !ARM_COMPUTE_CPP_SCHEDULER && ARM_COMPUTE_OPENMP_SCHEDULER Scheduler::Type Scheduler::_scheduler_type = Scheduler::Type::OMP; #elif ARM_COMPUTE_CPP_SCHEDULER && !ARM_COMPUTE_OPENMP_SCHEDULER -Scheduler::Type Scheduler::_scheduler_type = Scheduler::Type::CPP; +Scheduler::Type Scheduler::_scheduler_type = Scheduler::Type::CPP; #elif ARM_COMPUTE_CPP_SCHEDULER && ARM_COMPUTE_OPENMP_SCHEDULER Scheduler::Type Scheduler::_scheduler_type = Scheduler::Type::CPP; #else /* ARM_COMPUTE_*_SCHEDULER */ Scheduler::Type Scheduler::_scheduler_type = Scheduler::Type::ST; #endif /* ARM_COMPUTE_*_SCHEDULER */ +#ifndef ARM_COMPUTE_THREAD_LOCAL_SCHEDULER std::shared_ptr<IScheduler> Scheduler::_custom_scheduler = nullptr; +#else // ARM_COMPUTE_THREAD_LOCAL_SCHEDULER +std::shared_ptr<IScheduler> thread_local Scheduler::_custom_scheduler = nullptr; +#endif // ARM_COMPUTE_THREAD_LOCAL_SCHEDULER namespace { @@ -74,6 +78,30 @@ void Scheduler::set(Type t) _scheduler_type = t; } +bool Scheduler::is_set() +{ + if (_scheduler_type == Type::CUSTOM) + { + return _custom_scheduler != nullptr; + } + else + { + return !_schedulers.empty(); + } +} + +unsigned int Scheduler::num_threads() +{ + if (Scheduler::is_set()) + { + return Scheduler::get().num_threads(); + } + else + { + return CPUInfo::get().get_cpu_num(); + } +} + bool Scheduler::is_available(Type t) { if (t == Type::CUSTOM) -- cgit v1.2.1