about | summary | refs | log | tree | commit | diff
path: root/src/cpu/kernels/CpuMulKernel.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'src/cpu/kernels/CpuMulKernel.cpp')
-rw-r--r-- src/cpu/kernels/CpuMulKernel.cpp 46
1 files changed, 46 insertions, 0 deletions
diff --git a/src/cpu/kernels/CpuMulKernel.cpp b/src/cpu/kernels/CpuMulKernel.cpp
index 82e5445321..81bb85c3dd 100644
--- a/src/cpu/kernels/CpuMulKernel.cpp
+++ b/src/cpu/kernels/CpuMulKernel.cpp
@@ -34,6 +34,11 @@
#include <arm_neon.h>
+namespace
+{
+// Starting mws values picked by CpuMulKernel::get_mws() for the mul_F32_F32_F32 function
+// when the reported CPU model is N1 or V1 respectively. get_mws() may scale them down further.
+constexpr size_t default_mws_N1_fp32_neon{ 22447 };
+constexpr size_t default_mws_V1_fp32_neon{ 38982 };
+} // namespace
namespace arm_compute
{
namespace cpu
@@ -1909,6 +1914,47 @@ void CpuMulKernel::configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *
ICpuKernel::configure(win);
}
+// Returns the mws value to use for this kernel on the given platform.
+//
+// A tuned value is produced only when FP32 kernels are compiled in, the configured float function is
+// mul_F32_F32_F32 and the CPU model is N1 or V1. Every other combination yields ICPPKernel::default_mws.
+// thread_count is intentionally unused.
+size_t CpuMulKernel::get_mws(const CPUInfo &platform, size_t thread_count) const
+{
+    ARM_COMPUTE_UNUSED(thread_count);
+
+#if defined(ENABLE_FP32_KERNELS)
+    // Any function other than the FP32 multiply keeps the library-wide default.
+    if(this->_func_float != &mul_F32_F32_F32)
+    {
+        return ICPPKernel::default_mws;
+    }
+
+    // Only the N1 and V1 CPU models have a dedicated starting value.
+    const auto cpu_model = platform.get_cpu_model();
+    const bool is_n1     = (cpu_model == CPUModel::N1);
+    if(!is_n1 && !(cpu_model == CPUModel::V1))
+    {
+        return ICPPKernel::default_mws;
+    }
+    const size_t tuned_mws = is_n1 ? default_mws_N1_fp32_neon : default_mws_V1_fp32_neon;
+
+    // Tensor is 1D or was re-interpreted as 1D: use the tuned value unchanged.
+    const auto &win = this->window();
+    if(win.shape().num_dimensions() == 1)
+    {
+        return tuned_mws;
+    }
+
+    // Scale the tuned value down by the number of iterations along every dimension (x, z, w, etc) except
+    // the one we parallelize along (the y dimension). This allows for parallelization when the Y size is
+    // small but the other sizes are large. The result is clamped so that it never drops below 1.
+    const auto iterations_outside_y = win.num_iterations_total() / win.num_iterations(1);
+    return std::max(static_cast<size_t>(1), static_cast<size_t>(tuned_mws / iterations_outside_y));
+#else  /* ENABLE_FP32_KERNELS */
+    ARM_COMPUTE_UNUSED(platform);
+    return ICPPKernel::default_mws;
+#endif /* ENABLE_FP32_KERNELS */
+}
+
Status CpuMulKernel::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, ConvertPolicy overflow_policy,
RoundingPolicy rounding_policy)
{