Diffstat (limited to 'src/cpu/kernels/CpuElementwiseKernel.cpp')
-rw-r--r--  src/cpu/kernels/CpuElementwiseKernel.cpp | 91
1 file changed, 91 insertions, 0 deletions
diff --git a/src/cpu/kernels/CpuElementwiseKernel.cpp b/src/cpu/kernels/CpuElementwiseKernel.cpp
index 4b285fc2be..e76b05f296 100644
--- a/src/cpu/kernels/CpuElementwiseKernel.cpp
+++ b/src/cpu/kernels/CpuElementwiseKernel.cpp
@@ -32,6 +32,14 @@
#include <arm_neon.h>
+namespace
+{
+    static constexpr size_t default_min_max_mws_N1_fp32_neon = 25308;
+    static constexpr size_t default_min_max_mws_V1_fp32_neon = 34772;
+    static constexpr size_t default_div_mws_N1_fp32_neon     = 19043;
+    static constexpr size_t default_div_mws_V1_fp32_neon     = 25511;
+}
+
namespace arm_compute
{
namespace cpu
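
For context on the constants above: a kernel's MWS ("minimum workload size") is the smallest chunk of work worth handing to a single thread, so the scheduler stops splitting a workload once the chunks would fall below it. A minimal sketch of that gating logic, assuming a scheduler that only needs a thread-count suggestion (suggested_threads and its arguments are hypothetical names, not library API):

#include <algorithm>
#include <cstddef>

// Split `total` iterations across at most `max_threads` workers,
// but never give a worker fewer than `mws` iterations.
std::size_t suggested_threads(std::size_t total, std::size_t mws, std::size_t max_threads)
{
    const std::size_t by_mws = total / std::max<std::size_t>(mws, 1);
    return std::max<std::size_t>(1, std::min(by_mws, max_threads));
}

With the N1 min/max value of 25308, for example, a 100000-element FP32 tensor would be handed to at most 100000 / 25308 = 3 threads, regardless of how many cores are available.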
@@ -401,6 +409,48 @@ Status CpuArithmeticKernel::validate(ArithmeticOperation op, const ITensorInfo *
return Status{};
}
+size_t CpuArithmeticKernel::get_mws(const CPUInfo &platform, size_t thread_count) const
+{
+    ARM_COMPUTE_UNUSED(thread_count);
+
+#if defined(ENABLE_FP32_KERNELS)
+    if(this->_run_method == &neon_fp32_elementwise_binary<ArithmeticOperation::MIN>
+       || this->_run_method == &neon_fp32_elementwise_binary<ArithmeticOperation::MAX>)
+    {
+        size_t mws = ICPPKernel::default_mws;
+        if(platform.get_cpu_model() == CPUModel::N1)
+        {
+            mws = default_min_max_mws_N1_fp32_neon;
+        }
+        else if(platform.get_cpu_model() == CPUModel::V1)
+        {
+            mws = default_min_max_mws_V1_fp32_neon;
+        }
+        else
+        {
+            return ICPPKernel::default_mws;
+        }
+
+        // Tensor is 1D or was re-interpreted as 1D
+        if(this->window().shape().num_dimensions() == 1)
+        {
+            return mws;
+        }
+        else
+        {
+            // Scale mws down by the number of iterations along all dimensions (x, z, w, etc.) except the
+            // one we parallelize along (the y dimension). This allows parallelization when the y size is
+            // small but the other dimensions are large, which boosts performance.
+            mws = static_cast<size_t>(mws / (this->window().num_iterations_total() / this->window().num_iterations(1)));
+            return std::max(static_cast<size_t>(1), mws);
+        }
+    }
+#else /* ENABLE_FP32_KERNELS */
+    ARM_COMPUTE_UNUSED(platform);
+#endif /* ENABLE_FP32_KERNELS */
+    return ICPPKernel::default_mws;
+}
+
/** The division operator */
void CpuDivisionKernel::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst)
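
The else branch in the get_mws() hunk above divides the tuned MWS by the iteration count of every dimension except y, because the workload is only ever split between threads along y. A self-contained sketch of that arithmetic (the helper name and arguments are assumptions; the kernel reads these values from this->window()):

#include <algorithm>
#include <cstddef>

// Divide out every dimension except y (dimension 1), the one the
// workload is split along, mirroring the expression in get_mws().
std::size_t scale_mws_to_y(std::size_t mws, std::size_t total_iterations, std::size_t y_iterations)
{
    const std::size_t non_y = total_iterations / y_iterations; // x * z * w * ...
    return std::max<std::size_t>(1, mws / non_y);
}

For a 4096x8x4 window the non-y product is 16384, so the N1 min/max MWS of 25308 collapses to 1: y is only 8 deep, but each of those 8 slices carries enough x/z work to justify its own thread.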
@@ -410,6 +460,47 @@ void CpuDivisionKernel::configure(const ITensorInfo *src0, const ITensorInfo *sr
CpuArithmeticKernel::configure_common(src0, src1, dst);
}
+size_t CpuDivisionKernel::get_mws(const CPUInfo &platform, size_t thread_count) const
+{
+    ARM_COMPUTE_UNUSED(thread_count);
+
+#if defined(ENABLE_FP32_KERNELS)
+    if(this->_run_method == &neon_fp32_elementwise_binary<ArithmeticOperation::DIV>)
+    {
+        size_t mws = ICPPKernel::default_mws;
+        if(platform.get_cpu_model() == CPUModel::N1)
+        {
+            mws = default_div_mws_N1_fp32_neon;
+        }
+        else if(platform.get_cpu_model() == CPUModel::V1)
+        {
+            mws = default_div_mws_V1_fp32_neon;
+        }
+        else
+        {
+            return ICPPKernel::default_mws;
+        }
+
+        // Tensor is 1D or was re-interpreted as 1D
+        if(this->window().shape().num_dimensions() == 1)
+        {
+            return mws;
+        }
+        else
+        {
+            // Scale mws down by the number of iterations along all dimensions (x, z, w, etc.) except the
+            // one we parallelize along (the y dimension). This allows parallelization when the y size is
+            // small but the other dimensions are large, which boosts performance.
+            mws = static_cast<size_t>(mws / (this->window().num_iterations_total() / this->window().num_iterations(1)));
+            return std::max(static_cast<size_t>(1), mws);
+        }
+    }
+#else /* ENABLE_FP32_KERNELS */
+    ARM_COMPUTE_UNUSED(platform);
+#endif /* ENABLE_FP32_KERNELS */
+    return ICPPKernel::default_mws;
+}
+
Status CpuDivisionKernel::validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src0, 1, DataType::S32, DataType::F16, DataType::F32);
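
Both get_mws() overrides follow the same shape: confirm _run_method points at the tuned FP32 fast path, pick a constant by CPU model, then apply the y-dimension scaling. The per-model pick could be condensed as in the sketch below (tuned_mws is a hypothetical free function; the committed code keeps the branches inline in each kernel):

#include <cstddef>

#include "arm_compute/core/CPP/CPPTypes.h" // arm_compute::CPUModel

// Hypothetical condensation of the model lookup repeated in both kernels.
std::size_t tuned_mws(arm_compute::CPUModel model, std::size_t n1_value, std::size_t v1_value, std::size_t fallback)
{
    switch(model)
    {
        case arm_compute::CPUModel::N1:
            return n1_value;
        case arm_compute::CPUModel::V1:
            return v1_value;
        default:
            return fallback; // untuned cores keep ICPPKernel::default_mws
    }
}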