From 73bb6b7ad80801e56633ad4ea12b0404b586a979 Mon Sep 17 00:00:00 2001
From: Fadi Arafeh
Date: Thu, 6 Oct 2022 16:20:14 +0000
Subject: ONCPUML-1072: Tuned MWS values (for N1, V1) for binary operators used
 by oneDNN

Added approximate values for MWS for the following binary operators:
Add, Sub, Mul, Min, Max, Div

Change-Id: I5c4c75511129982a3f44c038ee272f09598469de
Signed-off-by: Fadi Arafeh
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/c/VisualCompute/ComputeLibrary/+/459609
Tested-by: bsgcomp
Reviewed-by: Viet-Hoa Do
Comments-Addressed: bsgcomp
Signed-off-by: fadara01
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/8392
Reviewed-by: Gunes Bayir
Tested-by: Arm Jenkins
Comments-Addressed: Arm Jenkins
Benchmark: Arm Jenkins
---
 src/cpu/kernels/CpuElementwiseKernel.cpp | 91 ++++++++++++++++++++++++++++++++
 1 file changed, 91 insertions(+)

(limited to 'src/cpu/kernels/CpuElementwiseKernel.cpp')

diff --git a/src/cpu/kernels/CpuElementwiseKernel.cpp b/src/cpu/kernels/CpuElementwiseKernel.cpp
index 4b285fc2be..e76b05f296 100644
--- a/src/cpu/kernels/CpuElementwiseKernel.cpp
+++ b/src/cpu/kernels/CpuElementwiseKernel.cpp
@@ -32,6 +32,14 @@
 
 #include <arm_neon.h>
 
+namespace
+{
+    static constexpr size_t default_min_max_mws_N1_fp32_neon = 25308;
+    static constexpr size_t default_min_max_mws_V1_fp32_neon = 34772;
+    static constexpr size_t default_div_mws_N1_fp32_neon = 19043;
+    static constexpr size_t default_div_mws_V1_fp32_neon = 25511;
+}
+
 namespace arm_compute
 {
 namespace cpu
@@ -401,6 +409,48 @@ Status CpuArithmeticKernel::validate(ArithmeticOperation op, const ITensorInfo *
     return Status{};
 }
 
+size_t CpuArithmeticKernel::get_mws(const CPUInfo &platform, size_t thread_count) const
+{
+    ARM_COMPUTE_UNUSED(thread_count);
+
+#if defined(ENABLE_FP32_KERNELS)
+    if(this->_run_method == &neon_fp32_elementwise_binary<ArithmeticOperation::MIN>
+    || this->_run_method == &neon_fp32_elementwise_binary<ArithmeticOperation::MAX>)
+    {
+        size_t mws = ICPPKernel::default_mws;
+        if(platform.get_cpu_model() == CPUModel::N1)
+        {
+            mws = default_min_max_mws_N1_fp32_neon;
+        }
+        else if(platform.get_cpu_model() == CPUModel::V1)
+        {
+            mws = default_min_max_mws_V1_fp32_neon;
+        }
+        else
+        {
+            return ICPPKernel::default_mws;
+        }
+
+        // tensor is 1D or was re-interpreted as 1D
+        if(this->window().shape().num_dimensions() == 1)
+        {
+            return mws;
+        }
+        else
+        {
+            // scale mws down by the number of elements along all the dimensions (x, z, w, etc) except the one
+            // that we parallelize along (the y dimension). This allows for parallelization when the Y_SIZE is small
+            // but the other sizes are large, which boosts performance.
+            mws = static_cast<size_t>(mws / (this->window().num_iterations_total() / this->window().num_iterations(1)));
+            return std::max(static_cast<size_t>(1), mws);
+        }
+    }
+#else /* ENABLE_FP32_KERNELS */
+    ARM_COMPUTE_UNUSED(platform);
+#endif /* ENABLE_FP32_KERNELS */
+    return ICPPKernel::default_mws;
+}
+
 /** The division operator */
 
 void CpuDivisionKernel::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst)
@@ -410,6 +460,47 @@ void CpuDivisionKernel::configure(const ITensorInfo *src0, const ITensorInfo *sr
     CpuArithmeticKernel::configure_common(src0, src1, dst);
 }
 
+size_t CpuDivisionKernel::get_mws(const CPUInfo &platform, size_t thread_count) const
+{
+    ARM_COMPUTE_UNUSED(thread_count);
+
+#if defined(ENABLE_FP32_KERNELS)
+    if(this->_run_method == &neon_fp32_elementwise_binary<ArithmeticOperation::DIV>)
+    {
+        size_t mws = ICPPKernel::default_mws;
+        if(platform.get_cpu_model() == CPUModel::N1)
+        {
+            mws = default_div_mws_N1_fp32_neon;
+        }
+        else if(platform.get_cpu_model() == CPUModel::V1)
+        {
+            mws = default_div_mws_V1_fp32_neon;
+        }
+        else
+        {
+            return ICPPKernel::default_mws;
+        }
+
+        // tensor is 1D or was re-interpreted as 1D
+        if(this->window().shape().num_dimensions() == 1)
+        {
+            return mws;
+        }
+        else
+        {
+            // scale mws down by the number of elements along all the dimensions (x, z, w, etc) except the one
+            // that we parallelize along (the y dimension). This allows for parallelization when the Y_SIZE is small
+            // but the other sizes are large, which boosts performance.
+            mws = static_cast<size_t>(mws / (this->window().num_iterations_total() / this->window().num_iterations(1)));
+            return std::max(static_cast<size_t>(1), mws);
+        }
+    }
+#else /* ENABLE_FP32_KERNELS */
+    ARM_COMPUTE_UNUSED(platform);
+#endif /* ENABLE_FP32_KERNELS */
+    return ICPPKernel::default_mws;
+}
+
 Status CpuDivisionKernel::validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src0, 1, DataType::S32, DataType::F16, DataType::F32);
-- 
cgit v1.2.1