author    Fadi Arafeh <fadi.arafeh@arm.com>    2022-10-06 16:20:14 +0000
committer fadi.arafeh <fadi.arafeh@arm.com>    2022-11-22 14:04:45 +0000
commit    73bb6b7ad80801e56633ad4ea12b0404b586a979 (patch)
tree      9f35a75499df4e1cc49cc6f3336c805384a53c13 /src/cpu/kernels/CpuSubKernel.cpp
parent    ca1a52d14551147456a9a1ea2e24f5c141a6d80e (diff)
download  ComputeLibrary-73bb6b7ad80801e56633ad4ea12b0404b586a979.tar.gz
ONCPUML-1072: Tuned MWS values (for N1, V1) for binary operators used by oneDNN
Added approximate values for MWS for the following binary operators:
Add, Sub, Mul, Min, Max, Div

Change-Id: I5c4c75511129982a3f44c038ee272f09598469de
Signed-off-by: Fadi Arafeh <fadi.arafeh@arm.com>
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/c/VisualCompute/ComputeLibrary/+/459609
Tested-by: bsgcomp <bsgcomp@arm.com>
Reviewed-by: Viet-Hoa Do <viet-hoa.do@arm.com>
Comments-Addressed: bsgcomp <bsgcomp@arm.com>
Signed-off-by: fadara01 <fadi.arafeh@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/8392
Reviewed-by: Gunes Bayir <gunes.bayir@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Benchmark: Arm Jenkins <bsgcomp@arm.com>
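For context: MWS ("minimum workload size") is the smallest chunk of work that is worth handing to a thread, so a larger MWS caps how many threads are used for a small tensor. The sketch below is purely illustrative (suggested_threads and its parameters are hypothetical, not ComputeLibrary API) and shows, under that assumption, how a caller could turn the value returned by get_mws() into a thread count.

#include <algorithm>
#include <cstddef>

// Hypothetical helper: cap the thread count so that no thread receives
// less than one MWS worth of work.
//   total_iterations - parallel iterations available in the kernel window
//   mws              - value returned by the kernel's get_mws()
//   max_threads      - size of the thread pool
std::size_t suggested_threads(std::size_t total_iterations, std::size_t mws, std::size_t max_threads)
{
    if (mws == 0)
    {
        return 1; // degenerate MWS: fall back to single-threaded execution
    }
    const std::size_t by_work = std::max<std::size_t>(1, total_iterations / mws);
    return std::min(by_work, max_threads);
}

With the N1 value below (24385), a 100000-element fp32 subtraction would be split across at most 100000 / 24385 = 4 threads, however large the thread pool is.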
Diffstat (limited to 'src/cpu/kernels/CpuSubKernel.cpp')
-rw-r--r--  src/cpu/kernels/CpuSubKernel.cpp  46
1 file changed, 46 insertions(+), 0 deletions(-)
diff --git a/src/cpu/kernels/CpuSubKernel.cpp b/src/cpu/kernels/CpuSubKernel.cpp
index d908e4ed28..ad74dda85d 100644
--- a/src/cpu/kernels/CpuSubKernel.cpp
+++ b/src/cpu/kernels/CpuSubKernel.cpp
@@ -31,6 +31,11 @@
#include "src/core/helpers/WindowHelpers.h"
#include "src/cpu/kernels/sub/neon/list.h"
+namespace
+{
+ static constexpr size_t default_mws_N1_fp32_neon = 24385;
+ static constexpr size_t default_mws_V1_fp32_neon = 40520;
+}
namespace arm_compute
{
namespace cpu
@@ -137,6 +142,47 @@ void CpuSubKernel::configure(const ITensorInfo *src0, const ITensorInfo *src1, I
ICpuKernel::configure(win);
}
+size_t CpuSubKernel::get_mws(const CPUInfo &platform, size_t thread_count) const
+{
+ ARM_COMPUTE_UNUSED(thread_count);
+
+#if defined(ENABLE_FP32_KERNELS)
+ if(this->_run_method == &sub_same_neon<float>)
+ {
+ size_t mws = ICPPKernel::default_mws;
+ if(platform.get_cpu_model() == CPUModel::N1)
+ {
+ mws = default_mws_N1_fp32_neon;
+ }
+ else if(platform.get_cpu_model() == CPUModel::V1)
+ {
+ mws = default_mws_V1_fp32_neon;
+ }
+ else
+ {
+ return ICPPKernel::default_mws;
+ }
+
+ // tensor is 1D or was re-interpreted as 1D
+ if(this->window().shape().num_dimensions() == 1)
+ {
+ return mws;
+ }
+ else
+ {
+ // scale mws down by the number of elements along all the dimensions (x, z, w, etc) except the one
+ // that we parallelize along (the y dimension). This allows for parallelization when the Y_SIZE is small
+ // but the other sizes are large, which boosts performance.
+ mws = static_cast<size_t>(mws / (this->window().num_iterations_total() / this->window().num_iterations(1)));
+ return std::max(static_cast<size_t>(1), mws);
+ }
+ }
+#else /* ENABLE_FP32_KERNELS */
+ ARM_COMPUTE_UNUSED(platform);
+#endif /* ENABLE_FP32_KERNELS */
+ return ICPPKernel::default_mws;
+}
+
Status CpuSubKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst);
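As a worked illustration of the scaling inside get_mws() above: the tuned MWS is divided by the number of iterations along every dimension except y (the dimension being parallelised). Assuming a hypothetical window of 8 x 1000 x 16 iterations on an N1 (the window shape is an assumption for illustration only):

#include <algorithm>
#include <cstddef>
#include <iostream>

int main()
{
    // Hypothetical window: x = 8, y = 1000, z = 16 iterations.
    const std::size_t x = 8, y = 1000, z = 16;
    const std::size_t total  = x * y * z;   // 128000 iterations in total
    const std::size_t mws_n1 = 24385;       // tuned N1 fp32 value from this patch

    // Same arithmetic as the patch: divide MWS by the work outside the y dimension.
    const std::size_t scaled = std::max<std::size_t>(1, mws_n1 / (total / y));
    std::cout << scaled << '\n';            // prints 190 (24385 / 128)
}

With the unscaled value, the 1000 y-iterations could never be split (1000 < 24385); after scaling, up to 1000 / 190 = 5 threads can share the work, and each still processes roughly 190 * 128 ≈ 24385 elements, i.e. close to the tuned per-thread target.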