From d4a9cc00a666c7d4c2a35c49d71b322f27e369fc Mon Sep 17 00:00:00 2001
From: Viet-Hoa Do <viet-hoa.do@arm.com>
Date: Tue, 8 Nov 2022 12:01:21 +0000
Subject: Fix CPU multiplication layer threading overhead

* When the tensors are reinterpreted as 1D, anything smaller than
  10KB won't be split across different threads.

Resolves: COMPMID-5630
Signed-off-by: Viet-Hoa Do <viet-hoa.do@arm.com>
Change-Id: Icff7089e37c85c8b325f099008a080a5805d36a2
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/8581
Benchmark: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Gunes Bayir <gunes.bayir@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
---
 src/cpu/kernels/CpuMulKernel.cpp | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

(limited to 'src/cpu/kernels/CpuMulKernel.cpp')

diff --git a/src/cpu/kernels/CpuMulKernel.cpp b/src/cpu/kernels/CpuMulKernel.cpp
index 487954b889..35a9958f65 100644
--- a/src/cpu/kernels/CpuMulKernel.cpp
+++ b/src/cpu/kernels/CpuMulKernel.cpp
@@ -1941,10 +1941,26 @@ void CpuMulKernel::run_op(ITensorPack &tensors, const Window &window, const Thre
         (*_func_float)(src1, src2, dst, window, _scale);
     }
 }
+
 const char *CpuMulKernel::name() const
 {
     return "CpuMulKernel";
 }
+
+size_t CpuMulKernel::get_mws(const CPUInfo &platform, size_t thread_count) const
+{
+    ARM_COMPUTE_UNUSED(platform, thread_count);
+
+    if(_split_dimension == Window::DimX)
+    {
+        // Avoid splitting the workload into overly small chunks when the tensor has been reinterpreted as 1D.
+        // The threshold is loosely chosen, since threading overhead varies wildly between platforms.
+        return 10240;
+    }
+
+    return default_mws;
+}
+
 namespace
 {
 Status validate_arguments_complex(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst)
-- 
cgit v1.2.1