From d4a9cc00a666c7d4c2a35c49d71b322f27e369fc Mon Sep 17 00:00:00 2001 From: Viet-Hoa Do Date: Tue, 8 Nov 2022 12:01:21 +0000 Subject: Fix CPU multiplication layer threading overhead * When the tensors are reinterpreted as 1D, anything smaller than 10KB won't be split across different threads. Resolves: COMPMID-5630 Signed-off-by: Viet-Hoa Do Change-Id: Icff7089e37c85c8b325f099008a080a5805d36a2 Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/8581 Benchmark: Arm Jenkins Tested-by: Arm Jenkins Reviewed-by: Gunes Bayir Comments-Addressed: Arm Jenkins --- src/cpu/kernels/CpuMulKernel.cpp | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'src/cpu/kernels/CpuMulKernel.cpp') diff --git a/src/cpu/kernels/CpuMulKernel.cpp b/src/cpu/kernels/CpuMulKernel.cpp index 487954b889..35a9958f65 100644 --- a/src/cpu/kernels/CpuMulKernel.cpp +++ b/src/cpu/kernels/CpuMulKernel.cpp @@ -1941,10 +1941,26 @@ void CpuMulKernel::run_op(ITensorPack &tensors, const Window &window, const Thre (*_func_float)(src1, src2, dst, window, _scale); } } + const char *CpuMulKernel::name() const { return "CpuMulKernel"; } + +size_t CpuMulKernel::get_mws(const CPUInfo &platform, size_t thread_count) const +{ + ARM_COMPUTE_UNUSED(platform, thread_count); + + if(_split_dimension == Window::DimX) + { + // Don't split the work load too small if the tensor has been reinterpreted as 1D. + // This number is loosely chosen as threading overhead in each platform varies wildly. + return 10240; + } + + return default_mws; +} + namespace { Status validate_arguments_complex(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst) -- cgit v1.2.1