aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--docs/user_guide/release_version_and_change_log.dox6
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/depthfirst_driver.hpp30
-rw-r--r--src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp8
3 files changed, 35 insertions, 9 deletions
diff --git a/docs/user_guide/release_version_and_change_log.dox b/docs/user_guide/release_version_and_change_log.dox
index 40ad09fd84..b1a4395eb5 100644
--- a/docs/user_guide/release_version_and_change_log.dox
+++ b/docs/user_guide/release_version_and_change_log.dox
@@ -1,5 +1,5 @@
///
-/// Copyright (c) 2017-2023 Arm Limited.
+/// Copyright (c) 2017-2024 Arm Limited.
///
/// SPDX-License-Identifier: MIT
///
@@ -41,6 +41,10 @@ If there is more than one release in a month then an extra sequential number is
@section S2_2_changelog Changelog
+v24.02 Public major release
+ - Performance optimizations:
+ - Parallelize @ref NEDepthwiseConvolutionLayer over batches if there is only 1 row
+
v24.01 Public major release
- Remove the legacy 'libarm_compute_core' library. This library is an artifact of Compute Library's legacy library architecture and no longer serves any purpose.
You should link only to the main `libarm_compute` library for core functionality.
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthfirst_driver.hpp b/src/core/NEON/kernels/arm_conv/depthwise/depthfirst_driver.hpp
index 592ee72820..95ece8cdc8 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/depthfirst_driver.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthfirst_driver.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022-2023 Arm Limited.
+ * Copyright (c) 2022-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -173,12 +173,30 @@ class DepthfirstDriver : public DepthwiseCommon<TInput, TWeight, TOutput>
const auto n_output_channels = args.input_channels * args.channel_multiplier;
- for (unsigned int batch = 0; batch < args.n_batches; batch++)
+ // By default we parallelize over the rows, but if there's only 1 row, we
+ // try to parallelize over batches
+ auto thread_id_for_rows = thread_id;
+ auto n_threads_for_rows = n_threads;
+ auto thread_id_for_batches = 0;
+ auto n_threads_for_batches = 1;
+ if (args.output_rows == 1) {
+ thread_id_for_rows = 0;
+ n_threads_for_rows = 1;
+ thread_id_for_batches = thread_id;
+ n_threads_for_batches = n_threads;
+ }
+
+ // Progress the pointers for the first batch.
+ input_tensor.base += ld_input_batch*thread_id_for_batches;
+ output_tensor.base += ld_output_batch*thread_id_for_batches;
+ for (unsigned int batch = thread_id_for_batches;
+ batch < args.n_batches;
+ batch += n_threads_for_batches)
{
// Iterate over rows of the output tensor; we stripe over the tiles.
- for (unsigned int start_output_i = thread_id * m_strat->get_output_rows();
+ for (unsigned int start_output_i = thread_id_for_rows * m_strat->get_output_rows();
start_output_i < args.output_rows;
- start_output_i += n_threads * m_strat->get_output_rows())
+ start_output_i += n_threads_for_rows * m_strat->get_output_rows())
{
// Determine what (if any padding) is required on the top/bottom of
// this row of the convolution.
@@ -264,8 +282,8 @@ class DepthfirstDriver : public DepthwiseCommon<TInput, TWeight, TOutput>
}
// Progress the pointers for the next batch.
- input_tensor.base += ld_input_batch;
- output_tensor.base += ld_output_batch;
+ input_tensor.base += ld_input_batch*n_threads_for_batches;
+ output_tensor.base += ld_output_batch*n_threads_for_batches;
}
}
diff --git a/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp b/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp
index 8d3741de96..38092adfee 100644
--- a/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp
+++ b/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2023 Arm Limited.
+ * Copyright (c) 2019-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -108,7 +108,11 @@ void CpuDepthwiseConv2dAssemblyDispatch::run(ITensorPack &tensors)
prepare(tensors);
- NEScheduler::get().schedule_op(_pImpl->asm_kernel.get(), Window::DimY, _pImpl->asm_kernel->window(), tensors);
+ // Split over rows (z) if there's more than 1, otherwise batches (w). This must
+ // match the threading strategy in DepthfirstDriver::execute_internal
+ auto split_dimension = _pImpl->asm_kernel->window().num_iterations(Window::DimZ) != 1 ? Window::DimZ : Window::DimW;
+
+ NEScheduler::get().schedule_op(_pImpl->asm_kernel.get(), split_dimension, _pImpl->asm_kernel->window(), tensors);
}
void CpuDepthwiseConv2dAssemblyDispatch::prepare(ITensorPack &tensors)