diff options
author | Fadi Arafeh <fadi.arafeh@arm.com> | 2022-10-06 16:20:14 +0000 |
---|---|---|
committer | fadi.arafeh <fadi.arafeh@arm.com> | 2022-11-22 14:04:45 +0000 |
commit | 73bb6b7ad80801e56633ad4ea12b0404b586a979 (patch) | |
tree | 9f35a75499df4e1cc49cc6f3336c805384a53c13 /src/cpu/kernels/CpuAddKernel.cpp | |
parent | ca1a52d14551147456a9a1ea2e24f5c141a6d80e (diff) | |
download | ComputeLibrary-73bb6b7ad80801e56633ad4ea12b0404b586a979.tar.gz |
ONCPUML-1072: Tuned MWS values (for N1, V1) for binary operators used by oneDNN
Added approximate values for MWS for the following binary operators:
Add, Sub, Mul, Min, Max, Div
Change-Id: I5c4c75511129982a3f44c038ee272f09598469de
Signed-off-by: Fadi Arafeh <fadi.arafeh@arm.com>
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/c/VisualCompute/ComputeLibrary/+/459609
Tested-by: bsgcomp <bsgcomp@arm.com>
Reviewed-by: Viet-Hoa Do <viet-hoa.do@arm.com>
Comments-Addressed: bsgcomp <bsgcomp@arm.com>
Signed-off-by: fadara01 <fadi.arafeh@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/8392
Reviewed-by: Gunes Bayir <gunes.bayir@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Benchmark: Arm Jenkins <bsgcomp@arm.com>
Diffstat (limited to 'src/cpu/kernels/CpuAddKernel.cpp')
-rw-r--r-- | src/cpu/kernels/CpuAddKernel.cpp | 40 |
1 files changed, 39 insertions, 1 deletions
```diff
diff --git a/src/cpu/kernels/CpuAddKernel.cpp b/src/cpu/kernels/CpuAddKernel.cpp
index 1648a46cdc..ec210a4a71 100644
--- a/src/cpu/kernels/CpuAddKernel.cpp
+++ b/src/cpu/kernels/CpuAddKernel.cpp
@@ -33,6 +33,11 @@
 #include "src/cpu/kernels/add/list.h"
 #include <array>
 
+namespace
+{
+    static constexpr size_t default_mws_N1_fp32_neon = 24536;
+    static constexpr size_t default_mws_V1_fp32_neon = 40510;
+}
 namespace arm_compute
 {
 namespace cpu
@@ -267,8 +272,41 @@ const std::vector<CpuAddKernel::AddKernel> &CpuAddKernel::get_available_kernels(
 size_t CpuAddKernel::get_mws(const CPUInfo &platform, size_t thread_count) const
 {
     ARM_COMPUTE_UNUSED(thread_count);
-    ARM_COMPUTE_UNUSED(platform);
 
+#if defined(ENABLE_FP32_KERNELS)
+    if(this->_run_method == &add_fp32_neon)
+    {
+        size_t mws = ICPPKernel::default_mws;
+        if(platform.get_cpu_model() == CPUModel::N1)
+        {
+            mws = default_mws_N1_fp32_neon;
+        }
+        else if(platform.get_cpu_model() == CPUModel::V1)
+        {
+            mws = default_mws_V1_fp32_neon;
+        }
+        else
+        {
+            return ICPPKernel::default_mws;
+        }
+
+        // tensor is 1D or was re-interpreted as 1D
+        if(this->window().shape().num_dimensions() == 1)
+        {
+            return mws;
+        }
+        else
+        {
+            // scale mws down by the number of elements along all the dimensions (x, z, w, etc) except the one
+            // that we parallelize along (the y dimension). This allows for parallelization when the Y_SIZE is small
+            // but the other sizes are large, which boosts performance.
+            mws = static_cast<size_t>(mws / (this->window().num_iterations_total() / this->window().num_iterations(1)));
+            return std::max(static_cast<size_t>(1), mws);
+        }
+    }
+#else /* ENABLE_FP32_KERNELS */
+    ARM_COMPUTE_UNUSED(platform);
+#endif /* ENABLE_FP32_KERNELS */
     return ICPPKernel::default_mws;
 }
 
```