diff options
3 files changed, 35 insertions, 9 deletions
diff --git a/docs/user_guide/release_version_and_change_log.dox b/docs/user_guide/release_version_and_change_log.dox index 40ad09fd84..b1a4395eb5 100644 --- a/docs/user_guide/release_version_and_change_log.dox +++ b/docs/user_guide/release_version_and_change_log.dox @@ -1,5 +1,5 @@ /// -/// Copyright (c) 2017-2023 Arm Limited. +/// Copyright (c) 2017-2024 Arm Limited. /// /// SPDX-License-Identifier: MIT /// @@ -41,6 +41,10 @@ If there is more than one release in a month then an extra sequential number is @section S2_2_changelog Changelog +v24.02 Public major release + - Performance optimizations: + - Parallelize @ref NEDepthwiseConvolutionLayer over batches if there is only 1 row + v24.01 Public major release - Remove the legacy 'libarm_compute_core' library. This library is an artifact of Compute Library's legacy library architecture and no longer serves any purpose. You should link only to the main `libarm_compute` library for core functionality. diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthfirst_driver.hpp b/src/core/NEON/kernels/arm_conv/depthwise/depthfirst_driver.hpp index 592ee72820..95ece8cdc8 100644 --- a/src/core/NEON/kernels/arm_conv/depthwise/depthfirst_driver.hpp +++ b/src/core/NEON/kernels/arm_conv/depthwise/depthfirst_driver.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023 Arm Limited. + * Copyright (c) 2022-2024 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -173,12 +173,30 @@ class DepthfirstDriver : public DepthwiseCommon<TInput, TWeight, TOutput> const auto n_output_channels = args.input_channels * args.channel_multiplier; - for (unsigned int batch = 0; batch < args.n_batches; batch++) + // By default we parallelize over the rows, but if there's only 1 row, we + // try to parallelize over batches + auto thread_id_for_rows = thread_id; + auto n_threads_for_rows = n_threads; + auto thread_id_for_batches = 0; + auto n_threads_for_batches = 1; + if (args.output_rows == 1) { + thread_id_for_rows = 0; + n_threads_for_rows = 1; + thread_id_for_batches = thread_id; + n_threads_for_batches = n_threads; + } + + // Progress the pointers for the first batch. + input_tensor.base += ld_input_batch*thread_id_for_batches; + output_tensor.base += ld_output_batch*thread_id_for_batches; + for (unsigned int batch = thread_id_for_batches; + batch < args.n_batches; + batch += n_threads_for_batches) { // Iterate over rows of the output tensor; we stripe over the tiles. - for (unsigned int start_output_i = thread_id * m_strat->get_output_rows(); + for (unsigned int start_output_i = thread_id_for_rows * m_strat->get_output_rows(); start_output_i < args.output_rows; - start_output_i += n_threads * m_strat->get_output_rows() + start_output_i += n_threads_for_rows * m_strat->get_output_rows()) { // Determine what (if any padding) is required on the top/bottom of // this row of the convolution. @@ -264,8 +282,8 @@ class DepthfirstDriver : public DepthwiseCommon<TInput, TWeight, TOutput> } // Progress the pointers for the next batch. 
- input_tensor.base += ld_input_batch; - output_tensor.base += ld_output_batch; + input_tensor.base += ld_input_batch*n_threads_for_batches; + output_tensor.base += ld_output_batch*n_threads_for_batches; } } diff --git a/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp b/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp index 8d3741de96..38092adfee 100644 --- a/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp +++ b/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023 Arm Limited. + * Copyright (c) 2019-2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -108,7 +108,11 @@ void CpuDepthwiseConv2dAssemblyDispatch::run(ITensorPack &tensors) prepare(tensors); - NEScheduler::get().schedule_op(_pImpl->asm_kernel.get(), Window::DimY, _pImpl->asm_kernel->window(), tensors); + // Split over rows (z) if there's more than 1, otherwise batches (w). This logic + // corresponds to the threading strategy in DepthfirstDriver::execute_internal + auto split_dimension = _pImpl->asm_kernel->window().num_iterations(Window::DimZ) == 1 ? Window::DimW : Window::DimZ; + + NEScheduler::get().schedule_op(_pImpl->asm_kernel.get(), split_dimension, _pImpl->asm_kernel->window(), tensors); } void CpuDepthwiseConv2dAssemblyDispatch::prepare(ITensorPack &tensors)