From 2db938cd1d026deb3689c668dd7031c00b9b339d Mon Sep 17 00:00:00 2001 From: Jonathan Deakin Date: Mon, 5 Feb 2024 15:32:31 +0000 Subject: Parallelize CPU depthwise over batch if only 1 row This patch also fixes a bug where the split dimension was wrong in CpuDepthwiseConv2dAssemblyDispatch::run. It was set to DimY, which is cols, but it should have been DimZ. This was rarely an issue in practice because typically the number of cols are greater than the number of threads anyway. Relates to: ONCPUML-1443 Co-authored-by: Milos Puzovic Change-Id: Ifed2fce22ddeb7cd77e6a6ae1083694427f91e04 Signed-off-by: Jonathan Deakin Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/11083 Benchmark: Arm Jenkins Tested-by: Arm Jenkins Reviewed-by: Jakub Sujak Comments-Addressed: Arm Jenkins --- src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'src/cpu') diff --git a/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp b/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp index 8d3741de96..38092adfee 100644 --- a/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp +++ b/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023 Arm Limited. + * Copyright (c) 2019-2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -108,7 +108,11 @@ void CpuDepthwiseConv2dAssemblyDispatch::run(ITensorPack &tensors) prepare(tensors); - NEScheduler::get().schedule_op(_pImpl->asm_kernel.get(), Window::DimY, _pImpl->asm_kernel->window(), tensors); + // Split over rows (z) if there's more than 1, otherwise batches (w). This logic + // corresponds to the threading strategy in DepthFirstDriver::execute_internal + auto split_dimension = _pImpl->asm_kernel->window().num_iterations(Window::DimZ) == 1 ? Window::DimZ : Window::DimW; + + NEScheduler::get().schedule_op(_pImpl->asm_kernel.get(), split_dimension, _pImpl->asm_kernel->window(), tensors); } void CpuDepthwiseConv2dAssemblyDispatch::prepare(ITensorPack &tensors) -- cgit v1.2.1