From 73bb6b7ad80801e56633ad4ea12b0404b586a979 Mon Sep 17 00:00:00 2001 From: Fadi Arafeh Date: Thu, 6 Oct 2022 16:20:14 +0000 Subject: ONCPUML-1072: Tuned MWS values (for N1, V1) for binary operators used by oneDNN Added approximate values for MWS for the following binary operators: Add, Sub, Mul, Min, Max, Div Change-Id: I5c4c75511129982a3f44c038ee272f09598469de Signed-off-by: Fadi Arafeh Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/c/VisualCompute/ComputeLibrary/+/459609 Tested-by: bsgcomp Reviewed-by: Viet-Hoa Do Comments-Addressed: bsgcomp Signed-off-by: fadara01 Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/8392 Reviewed-by: Gunes Bayir Tested-by: Arm Jenkins Comments-Addressed: Arm Jenkins Benchmark: Arm Jenkins --- arm_compute/core/CPP/CPPTypes.h | 3 +- src/common/cpuinfo/CpuModel.cpp | 8 ++- src/cpu/kernels/CpuAddKernel.cpp | 40 +++++++++++++- src/cpu/kernels/CpuAddKernel.h | 2 +- src/cpu/kernels/CpuElementwiseKernel.cpp | 91 ++++++++++++++++++++++++++++++++ src/cpu/kernels/CpuElementwiseKernel.h | 32 ++++++----- src/cpu/kernels/CpuMulKernel.cpp | 46 ++++++++++++++++ src/cpu/kernels/CpuMulKernel.h | 9 ++++ src/cpu/kernels/CpuSubKernel.cpp | 46 ++++++++++++++++ src/cpu/kernels/CpuSubKernel.h | 9 ++++ 10 files changed, 269 insertions(+), 17 deletions(-) diff --git a/arm_compute/core/CPP/CPPTypes.h b/arm_compute/core/CPP/CPPTypes.h index afefb1aeb0..c3fb47fb61 100644 --- a/arm_compute/core/CPP/CPPTypes.h +++ b/arm_compute/core/CPP/CPPTypes.h @@ -48,7 +48,8 @@ struct CpuIsaInfo; X(A510) \ X(X1) \ X(V1) \ - X(A64FX) + X(A64FX) \ + X(N1) /** CPU models types * diff --git a/src/common/cpuinfo/CpuModel.cpp b/src/common/cpuinfo/CpuModel.cpp index 6382ffd5b4..d6d91df133 100644 --- a/src/common/cpuinfo/CpuModel.cpp +++ b/src/common/cpuinfo/CpuModel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021-2022 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -54,6 +54,7 @@ bool model_supports_fp16(CpuModel model) case CpuModel::X1: case CpuModel::V1: case CpuModel::A64FX: + case CpuModel::N1: return true; default: return false; @@ -69,6 +70,7 @@ bool model_supports_dot(CpuModel model) case CpuModel::A510: case CpuModel::X1: case CpuModel::V1: + case CpuModel::N1: return true; default: return false; @@ -116,9 +118,11 @@ CpuModel midr_to_model(uint32_t midr) model = CpuModel::GENERIC_FP16; } break; + case 0xd0c: // N1 + model = CpuModel::N1; + break; case 0xd06: // A65 case 0xd0b: // A76 - case 0xd0c: // N1 case 0xd0d: // A77 case 0xd0e: // A76AE case 0xd41: // A78 diff --git a/src/cpu/kernels/CpuAddKernel.cpp b/src/cpu/kernels/CpuAddKernel.cpp index 1648a46cdc..ec210a4a71 100644 --- a/src/cpu/kernels/CpuAddKernel.cpp +++ b/src/cpu/kernels/CpuAddKernel.cpp @@ -33,6 +33,11 @@ #include "src/cpu/kernels/add/list.h" #include +namespace +{ + static constexpr size_t default_mws_N1_fp32_neon = 24536; + static constexpr size_t default_mws_V1_fp32_neon = 40510; +} namespace arm_compute { namespace cpu @@ -267,8 +272,41 @@ const std::vector &CpuAddKernel::get_available_kernels( size_t CpuAddKernel::get_mws(const CPUInfo &platform, size_t thread_count) const { ARM_COMPUTE_UNUSED(thread_count); - ARM_COMPUTE_UNUSED(platform); +#if defined(ENABLE_FP32_KERNELS) + if(this->_run_method == &add_fp32_neon) + { + size_t mws = ICPPKernel::default_mws; + if(platform.get_cpu_model() == CPUModel::N1) + { + mws = default_mws_N1_fp32_neon; + } + else if(platform.get_cpu_model() == CPUModel::V1) + { + mws = default_mws_V1_fp32_neon; + } + else + { + return ICPPKernel::default_mws; + } + + // tensor is 1D or was re-interpreted as 1D + if(this->window().shape().num_dimensions() == 1) + { + return mws; + } + else + { + // scale mws down by the number of elements along all the dimensions (x, z, w, etc) except the one + // that we parallelize along (the y dimension). 
This allows for parallelization when the Y_SIZE is small + but the other sizes are large, which boosts performance. + mws = static_cast<size_t>(mws / (this->window().num_iterations_total() / this->window().num_iterations(1))); + return std::max(static_cast<size_t>(1), mws); + } + } +#else /* ENABLE_FP32_KERNELS */ + ARM_COMPUTE_UNUSED(platform); +#endif /* ENABLE_FP32_KERNELS */ return ICPPKernel::default_mws; } diff --git a/src/cpu/kernels/CpuAddKernel.h b/src/cpu/kernels/CpuAddKernel.h index e2062c8c33..9921feabe2 100644 --- a/src/cpu/kernels/CpuAddKernel.h +++ b/src/cpu/kernels/CpuAddKernel.h @@ -85,7 +85,7 @@ public: * @param[in] platform The CPU platform used to create the context. * @param[in] thread_count Number of threads in the execution. * - * @return[out] small_network_mws Minimum workload size for requsted configuration. + * @return[out] mws Minimum workload size for requested configuration. */ size_t get_mws(const CPUInfo &platform, size_t thread_count) const override; diff --git a/src/cpu/kernels/CpuElementwiseKernel.cpp b/src/cpu/kernels/CpuElementwiseKernel.cpp index 4b285fc2be..e76b05f296 100644 --- a/src/cpu/kernels/CpuElementwiseKernel.cpp +++ b/src/cpu/kernels/CpuElementwiseKernel.cpp @@ -32,6 +32,14 @@ #include +namespace +{ + static constexpr size_t default_min_max_mws_N1_fp32_neon = 25308; + static constexpr size_t default_min_max_mws_V1_fp32_neon = 34772; + static constexpr size_t default_div_mws_N1_fp32_neon = 19043; + static constexpr size_t default_div_mws_V1_fp32_neon = 25511; +} + namespace arm_compute { namespace cpu @@ -401,6 +409,48 @@ Status CpuArithmeticKernel::validate(ArithmeticOperation op, const ITensorInfo * return Status{}; } +size_t CpuArithmeticKernel::get_mws(const CPUInfo &platform, size_t thread_count) const +{ + ARM_COMPUTE_UNUSED(thread_count); + +#if defined(ENABLE_FP32_KERNELS) + if(this->_run_method == &neon_fp32_elementwise_binary<ArithmeticOperation::MIN> + || this->_run_method == &neon_fp32_elementwise_binary<ArithmeticOperation::MAX>) + { + size_t mws =
ICPPKernel::default_mws; + if(platform.get_cpu_model() == CPUModel::N1) + { + mws = default_min_max_mws_N1_fp32_neon; + } + else if(platform.get_cpu_model() == CPUModel::V1) + { + mws = default_min_max_mws_V1_fp32_neon; + } + else + { + return ICPPKernel::default_mws; + } + + // tensor is 1D or was re-interpreted as 1D + if(this->window().shape().num_dimensions() == 1) + { + return mws; + } + else + { + // scale mws down by the number of elements along all the dimensions (x, z, w, etc) except the one + // that we parallelize along (the y dimension). This allows for parallelization when the Y_SIZE is small + // but the other sizes are large, which boosts performance. + mws = static_cast<size_t>(mws / (this->window().num_iterations_total() / this->window().num_iterations(1))); + return std::max(static_cast<size_t>(1), mws); + } + } +#else /* ENABLE_FP32_KERNELS */ + ARM_COMPUTE_UNUSED(platform); +#endif /* ENABLE_FP32_KERNELS */ + return ICPPKernel::default_mws; +} + /** The division operator */ void CpuDivisionKernel::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst) @@ -410,6 +460,47 @@ void CpuDivisionKernel::configure(const ITensorInfo *src0, const ITensorInfo *sr CpuArithmeticKernel::configure_common(src0, src1, dst); } +size_t CpuDivisionKernel::get_mws(const CPUInfo &platform, size_t thread_count) const +{ + ARM_COMPUTE_UNUSED(thread_count); + +#if defined(ENABLE_FP32_KERNELS) + if(this->_run_method == &neon_fp32_elementwise_binary<ArithmeticOperation::DIV>) + { + size_t mws = ICPPKernel::default_mws; + if(platform.get_cpu_model() == CPUModel::N1) + { + mws = default_div_mws_N1_fp32_neon; + } + else if(platform.get_cpu_model() == CPUModel::V1) + { + mws = default_div_mws_V1_fp32_neon; + } + else + { + return ICPPKernel::default_mws; + } + + // tensor is 1D or was re-interpreted as 1D + if(this->window().shape().num_dimensions() == 1) + { + return mws; + } + else + { + // scale mws down by the number of elements along all the dimensions (x, z, w, etc) except the one + //
that we parallelize along (the y dimension). This allows for parallelization when the Y_SIZE is small + // but the other sizes are large, which boosts performance. + mws = static_cast(mws / (this->window().num_iterations_total() / this->window().num_iterations(1))); + return std::max(static_cast(1), mws); + } + } +#else /* ENABLE_FP32_KERNELS */ + ARM_COMPUTE_UNUSED(platform); +#endif /* ENABLE_FP32_KERNELS */ + return ICPPKernel::default_mws; +} + Status CpuDivisionKernel::validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src0, 1, DataType::S32, DataType::F16, DataType::F32); diff --git a/src/cpu/kernels/CpuElementwiseKernel.h b/src/cpu/kernels/CpuElementwiseKernel.h index 2785e0a44c..634e38bf9f 100644 --- a/src/cpu/kernels/CpuElementwiseKernel.h +++ b/src/cpu/kernels/CpuElementwiseKernel.h @@ -72,8 +72,8 @@ protected: static Status validate_arguments_common(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst); protected: - std::function _run_method{ nullptr }; - std::string _name{}; + ElementwiseKernelPtr _run_method{ nullptr }; + std::string _name{}; }; class CpuArithmeticKernel : public CpuElementwiseKernel @@ -100,6 +100,15 @@ public: static const std::vector::ElementwiseKernel> &get_available_kernels(); + /** Return minimum workload size of the relevant kernel + * + * @param[in] platform The CPU platform used to create the context. + * @param[in] thread_count Number of threads in the execution. + * + * @return[out] mws Minimum workload size for requested configuration. + */ + size_t get_mws(const CPUInfo &platform, size_t thread_count) const override; + protected: /** Commmon configure function for element-wise operators with no additional options (e.g. 
Min, Max, SquaredDiff) */ @@ -108,16 +117,6 @@ protected: static Status validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst); ArithmeticOperation _op{}; - -private: - /** Function to get the micro kernel implementation - * - * @param[in] src0 First input tensor information - * @param[in] src1 Second input tensor information - * @param[in] dst Output tensor information - * - * @return the function instance for the micro kernel - */ }; class CpuDivisionKernel : public CpuArithmeticKernel @@ -141,6 +140,15 @@ public: */ static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst); + /** Return minimum workload size of the relevant kernel + * + * @param[in] platform The CPU platform used to create the context. + * @param[in] thread_count Number of threads in the execution. + * + * @return[out] mws Minimum workload size for requested configuration. + */ + size_t get_mws(const CPUInfo &platform, size_t thread_count) const override; + protected: // Inherited methods overridden: static Status validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst); diff --git a/src/cpu/kernels/CpuMulKernel.cpp b/src/cpu/kernels/CpuMulKernel.cpp index 82e5445321..81bb85c3dd 100644 --- a/src/cpu/kernels/CpuMulKernel.cpp +++ b/src/cpu/kernels/CpuMulKernel.cpp @@ -34,6 +34,11 @@ #include +namespace +{ + static constexpr size_t default_mws_N1_fp32_neon = 22447; + static constexpr size_t default_mws_V1_fp32_neon = 38982; +} namespace arm_compute { namespace cpu @@ -1909,6 +1914,47 @@ void CpuMulKernel::configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo * ICpuKernel::configure(win); } +size_t CpuMulKernel::get_mws(const CPUInfo &platform, size_t thread_count) const +{ + ARM_COMPUTE_UNUSED(thread_count); + +#if defined(ENABLE_FP32_KERNELS) + if(this->_func_float == &mul_F32_F32_F32) + { + size_t mws = ICPPKernel::default_mws; + if(platform.get_cpu_model() == CPUModel::N1) + { + 
mws = default_mws_N1_fp32_neon; + } + else if(platform.get_cpu_model() == CPUModel::V1) + { + mws = default_mws_V1_fp32_neon; + } + else + { + return ICPPKernel::default_mws; + } + + // tensor is 1D or was re-interpreted as 1D + if(this->window().shape().num_dimensions() == 1) + { + return mws; + } + else + { + // scale mws down by the number of elements along all the dimensions (x, z, w, etc) except the one + // that we parallelize along (the y dimension). This allows for parallelization when the Y_SIZE is small + // but the other sizes are large, which boosts performance. + mws = static_cast<size_t>(mws / (this->window().num_iterations_total() / this->window().num_iterations(1))); + return std::max(static_cast<size_t>(1), mws); + } + } +#else /* ENABLE_FP32_KERNELS */ + ARM_COMPUTE_UNUSED(platform); +#endif /* ENABLE_FP32_KERNELS */ + return ICPPKernel::default_mws; +} + Status CpuMulKernel::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy) { diff --git a/src/cpu/kernels/CpuMulKernel.h b/src/cpu/kernels/CpuMulKernel.h index c92e1efdf4..73ffc0dd2b 100644 --- a/src/cpu/kernels/CpuMulKernel.h +++ b/src/cpu/kernels/CpuMulKernel.h @@ -81,6 +81,15 @@ public: const char *name() const override; size_t get_mws(const CPUInfo &platform, size_t thread_count) const override; + /** Return minimum workload size of the relevant kernel + * + * @param[in] platform The CPU platform used to create the context. + * @param[in] thread_count Number of threads in the execution. + * + * @return[out] mws Minimum workload size for requested configuration. + */ + size_t get_mws(const CPUInfo &platform, size_t thread_count) const override; + /** Get the preferred dimension in which the scheduler splits the work into multiple jobs. * * @return The split dimension hint.
diff --git a/src/cpu/kernels/CpuSubKernel.cpp b/src/cpu/kernels/CpuSubKernel.cpp index d908e4ed28..ad74dda85d 100644 --- a/src/cpu/kernels/CpuSubKernel.cpp +++ b/src/cpu/kernels/CpuSubKernel.cpp @@ -31,6 +31,11 @@ #include "src/core/helpers/WindowHelpers.h" #include "src/cpu/kernels/sub/neon/list.h" +namespace +{ + static constexpr size_t default_mws_N1_fp32_neon = 24385; + static constexpr size_t default_mws_V1_fp32_neon = 40520; +} namespace arm_compute { namespace cpu @@ -137,6 +142,47 @@ void CpuSubKernel::configure(const ITensorInfo *src0, const ITensorInfo *src1, I ICpuKernel::configure(win); } +size_t CpuSubKernel::get_mws(const CPUInfo &platform, size_t thread_count) const +{ + ARM_COMPUTE_UNUSED(thread_count); + +#if defined(ENABLE_FP32_KERNELS) + if(this->_run_method == &sub_same_neon<float>) + { + size_t mws = ICPPKernel::default_mws; + if(platform.get_cpu_model() == CPUModel::N1) + { + mws = default_mws_N1_fp32_neon; + } + else if(platform.get_cpu_model() == CPUModel::V1) + { + mws = default_mws_V1_fp32_neon; + } + else + { + return ICPPKernel::default_mws; + } + + // tensor is 1D or was re-interpreted as 1D + if(this->window().shape().num_dimensions() == 1) + { + return mws; + } + else + { + // scale mws down by the number of elements along all the dimensions (x, z, w, etc) except the one + // that we parallelize along (the y dimension). This allows for parallelization when the Y_SIZE is small + // but the other sizes are large, which boosts performance.
+ mws = static_cast<size_t>(mws / (this->window().num_iterations_total() / this->window().num_iterations(1))); + return std::max(static_cast<size_t>(1), mws); + } + } +#else /* ENABLE_FP32_KERNELS */ + ARM_COMPUTE_UNUSED(platform); +#endif /* ENABLE_FP32_KERNELS */ + return ICPPKernel::default_mws; +} + Status CpuSubKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst); diff --git a/src/cpu/kernels/CpuSubKernel.h b/src/cpu/kernels/CpuSubKernel.h index e835bac3d5..3d80b34279 100644 --- a/src/cpu/kernels/CpuSubKernel.h +++ b/src/cpu/kernels/CpuSubKernel.h @@ -73,6 +73,15 @@ public: void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; + /** Return minimum workload size of the relevant kernel + * + * @param[in] platform The CPU platform used to create the context. + * @param[in] thread_count Number of threads in the execution. + * + * @return[out] mws Minimum workload size for requested configuration. + */ + size_t get_mws(const CPUInfo &platform, size_t thread_count) const override; + struct SubKernel { const char *name; -- cgit v1.2.1