3 files changed, 35 insertions, 9 deletions
diff --git a/docs/user_guide/release_version_and_change_log.dox b/docs/user_guide/release_version_and_change_log.dox
index 40ad09fd84..b1a4395eb5 100644
--- a/docs/user_guide/release_version_and_change_log.dox
+++ b/docs/user_guide/release_version_and_change_log.dox
@@ -1,5 +1,5 @@
 ///
-/// Copyright (c) 2017-2023 Arm Limited.
+/// Copyright (c) 2017-2024 Arm Limited.
 ///
 /// SPDX-License-Identifier: MIT
 ///
@@ -41,6 +41,10 @@ If there is more than one release in a month then an extra sequential number is
 
 @section S2_2_changelog Changelog
 
+v24.02 Public major release
+ - Performance optimizations:
+   - Parallelize @ref NEDepthwiseConvolutionLayer over batches if there is only 1 row
+
 v24.01 Public major release
  - Remove the legacy 'libarm_compute_core' library. This library is an artifact of Compute Library's legacy library architecture and no longer serves any purpose.
   You should link only to the main `libarm_compute` library for core functionality.
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthfirst_driver.hpp b/src/core/NEON/kernels/arm_conv/depthwise/depthfirst_driver.hpp
index 592ee72820..95ece8cdc8 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/depthfirst_driver.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthfirst_driver.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023 Arm Limited.
+ * Copyright (c) 2022-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -173,12 +173,30 @@ class DepthfirstDriver : public DepthwiseCommon<TInput, TWeight, TOutput>
 
     const auto n_output_channels = args.input_channels * args.channel_multiplier;
 
-    for (unsigned int batch = 0; batch < args.n_batches; batch++)
+    // By default we parallelize over the rows, but if there's only 1 row, we
+    // try to parallelize over batches
+    auto thread_id_for_rows = thread_id;
+    auto n_threads_for_rows = n_threads;
+    auto thread_id_for_batches = 0;
+    auto n_threads_for_batches = 1;
+    if (args.output_rows == 1) {
+      thread_id_for_rows = 0;
+      n_threads_for_rows = 1;
+      thread_id_for_batches = thread_id;
+      n_threads_for_batches = n_threads;
+    }
+
+    // Progress the pointers for the first batch.
+    input_tensor.base += ld_input_batch*thread_id_for_batches;
+    output_tensor.base += ld_output_batch*thread_id_for_batches;
+    for (unsigned int batch = thread_id_for_batches;
+          batch < args.n_batches;
+          batch += n_threads_for_batches)
     {
       // Iterate over rows of the output tensor; we stripe over the tiles.
-      for (unsigned int start_output_i = thread_id * m_strat->get_output_rows();
+      for (unsigned int start_output_i = thread_id_for_rows * m_strat->get_output_rows();
            start_output_i < args.output_rows;
-           start_output_i += n_threads * m_strat->get_output_rows())
+           start_output_i += n_threads_for_rows * m_strat->get_output_rows())
       {
         // Determine what (if any padding) is required on the top/bottom of
         // this row of the convolution.
@@ -264,8 +282,8 @@ class DepthfirstDriver : public DepthwiseCommon<TInput, TWeight, TOutput>
       }
 
       // Progress the pointers for the next batch.
-      input_tensor.base += ld_input_batch;
-      output_tensor.base += ld_output_batch;
+      input_tensor.base += ld_input_batch*n_threads_for_batches;
+      output_tensor.base += ld_output_batch*n_threads_for_batches;
     }
   }
 
diff --git a/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp b/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp
index 8d3741de96..38092adfee 100644
--- a/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp
+++ b/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023 Arm Limited.
+ * Copyright (c) 2019-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -108,7 +108,11 @@ void CpuDepthwiseConv2dAssemblyDispatch::run(ITensorPack &tensors)
 
     prepare(tensors);
 
-    NEScheduler::get().schedule_op(_pImpl->asm_kernel.get(), Window::DimY, _pImpl->asm_kernel->window(), tensors);
+    // Split over rows (z) if there's more than 1, otherwise batches (w). This logic
+    // corresponds to the threading strategy in DepthfirstDriver::execute_internal
+    auto split_dimension = _pImpl->asm_kernel->window().num_iterations(Window::DimZ) != 1 ? Window::DimZ : Window::DimW;
+
+    NEScheduler::get().schedule_op(_pImpl->asm_kernel.get(), split_dimension, _pImpl->asm_kernel->window(), tensors);
 }
 
 void CpuDepthwiseConv2dAssemblyDispatch::prepare(ITensorPack &tensors)