Parallelize CPU depthwise over batch if only 1 row

This patch also fixes a bug where the split dimension was wrong in CpuDepthwiseConv2dAssemblyDispatch::run. It was set to DimY, which is cols, but it should have been DimZ. This was rarely an issue in practice because typically the number of cols is greater than the number of threads anyway.

Relates to: ONCPUML-1443
Co-authored-by: Milos Puzovic <Milos.Puzovic@arm.com>
Change-Id: Ifed2fce22ddeb7cd77e6a6ae1083694427f91e04
Signed-off-by: Jonathan Deakin <jonathan.deakin@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/11083
Benchmark: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Jakub Sujak <jakub.sujak@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
author: Jonathan Deakin <jonathan.deakin@arm.com> 2024-02-05 15:32:31 +0000
committer: Jonathan Deakin <jonathan.deakin@arm.com> 2024-02-07 09:15:39 +0000
commit: 2db938cd1d026deb3689c668dd7031c00b9b339d (patch)
tree: a1f046ae6a53cddf3772a895107be9457f3359cd /src/core/NEON/kernels
parent: e695579911fbe6aa06b11dbeeec7af5637a92f2b (diff)
download: ComputeLibrary-2db938cd1d026deb3689c668dd7031c00b9b339d.tar.gz
1 files changed, 24 insertions, 6 deletions
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthfirst_driver.hpp b/src/core/NEON/kernels/arm_conv/depthwise/depthfirst_driver.hpp
index 592ee72820..95ece8cdc8 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/depthfirst_driver.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthfirst_driver.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023 Arm Limited.
+ * Copyright (c) 2022-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -173,12 +173,30 @@ class DepthfirstDriver : public DepthwiseCommon<TInput, TWeight, TOutput>
 
     const auto n_output_channels = args.input_channels * args.channel_multiplier;
 
-    for (unsigned int batch = 0; batch < args.n_batches; batch++)
+    // By default we parallelize over the rows, but if there's only 1 row, we
+    // try to parallelize over batches
+    auto thread_id_for_rows = thread_id;
+    auto n_threads_for_rows = n_threads;
+    auto thread_id_for_batches = 0;
+    auto n_threads_for_batches = 1;
+    if (args.output_rows == 1) {
+      thread_id_for_rows = 0;
+      n_threads_for_rows = 1;
+      thread_id_for_batches = thread_id;
+      n_threads_for_batches = n_threads;
+    }
+
+    // Progress the pointers for the first batch.
+    input_tensor.base += ld_input_batch*thread_id_for_batches;
+    output_tensor.base += ld_output_batch*thread_id_for_batches;
+    for (unsigned int batch = thread_id_for_batches;
+          batch < args.n_batches;
+          batch += n_threads_for_batches)
     {
       // Iterate over rows of the output tensor; we stripe over the tiles.
-      for (unsigned int start_output_i = thread_id * m_strat->get_output_rows();
+      for (unsigned int start_output_i = thread_id_for_rows * m_strat->get_output_rows();
            start_output_i < args.output_rows;
-           start_output_i += n_threads * m_strat->get_output_rows())
+           start_output_i += n_threads_for_rows * m_strat->get_output_rows())
       {
         // Determine what (if any padding) is required on the top/bottom of
         // this row of the convolution.
@@ -264,8 +282,8 @@ class DepthfirstDriver : public DepthwiseCommon<TInput, TWeight, TOutput>
       }
 
       // Progress the pointers for the next batch.
-      input_tensor.base += ld_input_batch;
-      output_tensor.base += ld_output_batch;
+      input_tensor.base += ld_input_batch*n_threads_for_batches;
+      output_tensor.base += ld_output_batch*n_threads_for_batches;
     }
   }
author	Jonathan Deakin <jonathan.deakin@arm.com>	2024-02-05 15:32:31 +0000
committer	Jonathan Deakin <jonathan.deakin@arm.com>	2024-02-07 09:15:39 +0000
commit	2db938cd1d026deb3689c668dd7031c00b9b339d (patch)
tree	a1f046ae6a53cddf3772a895107be9457f3359cd /src/core/NEON/kernels
parent	e695579911fbe6aa06b11dbeeec7af5637a92f2b (diff)
download	ComputeLibrary-2db938cd1d026deb3689c668dd7031c00b9b339d.tar.gz