about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author      Fadi Arafeh <fadi.arafeh@arm.com>      2022-10-06 16:20:14 +0000
committer   fadi.arafeh <fadi.arafeh@arm.com>      2022-11-22 14:04:45 +0000
commit      73bb6b7ad80801e56633ad4ea12b0404b586a979 (patch)
tree        9f35a75499df4e1cc49cc6f3336c805384a53c13
parent      ca1a52d14551147456a9a1ea2e24f5c141a6d80e (diff)
download    ComputeLibrary-73bb6b7ad80801e56633ad4ea12b0404b586a979.tar.gz
ONCPUML-1072: Tuned MWS values (for N1, V1) for binary operators used by oneDNN
Added approximate values for MWS for the following binary operators:
Add, Sub, Mul, Min, Max, Div

Change-Id: I5c4c75511129982a3f44c038ee272f09598469de
Signed-off-by: Fadi Arafeh <fadi.arafeh@arm.com>
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/c/VisualCompute/ComputeLibrary/+/459609
Tested-by: bsgcomp <bsgcomp@arm.com>
Reviewed-by: Viet-Hoa Do <viet-hoa.do@arm.com>
Comments-Addressed: bsgcomp <bsgcomp@arm.com>
Signed-off-by: fadara01 <fadi.arafeh@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/8392
Reviewed-by: Gunes Bayir <gunes.bayir@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Benchmark: Arm Jenkins <bsgcomp@arm.com>
-rw-r--r--  arm_compute/core/CPP/CPPTypes.h           |  3
-rw-r--r--  src/common/cpuinfo/CpuModel.cpp           |  8
-rw-r--r--  src/cpu/kernels/CpuAddKernel.cpp          | 40
-rw-r--r--  src/cpu/kernels/CpuAddKernel.h            |  2
-rw-r--r--  src/cpu/kernels/CpuElementwiseKernel.cpp  | 91
-rw-r--r--  src/cpu/kernels/CpuElementwiseKernel.h    | 32
-rw-r--r--  src/cpu/kernels/CpuMulKernel.cpp          | 46
-rw-r--r--  src/cpu/kernels/CpuMulKernel.h            |  9
-rw-r--r--  src/cpu/kernels/CpuSubKernel.cpp          | 46
-rw-r--r--  src/cpu/kernels/CpuSubKernel.h            |  9
10 files changed, 269 insertions, 17 deletions
diff --git a/arm_compute/core/CPP/CPPTypes.h b/arm_compute/core/CPP/CPPTypes.h
index afefb1aeb0..c3fb47fb61 100644
--- a/arm_compute/core/CPP/CPPTypes.h
+++ b/arm_compute/core/CPP/CPPTypes.h
@@ -48,7 +48,8 @@ struct CpuIsaInfo;
X(A510) \
X(X1) \
X(V1) \
- X(A64FX)
+ X(A64FX) \
+ X(N1)
/** CPU models types
*
diff --git a/src/common/cpuinfo/CpuModel.cpp b/src/common/cpuinfo/CpuModel.cpp
index 6382ffd5b4..d6d91df133 100644
--- a/src/common/cpuinfo/CpuModel.cpp
+++ b/src/common/cpuinfo/CpuModel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -54,6 +54,7 @@ bool model_supports_fp16(CpuModel model)
case CpuModel::X1:
case CpuModel::V1:
case CpuModel::A64FX:
+ case CpuModel::N1:
return true;
default:
return false;
@@ -69,6 +70,7 @@ bool model_supports_dot(CpuModel model)
case CpuModel::A510:
case CpuModel::X1:
case CpuModel::V1:
+ case CpuModel::N1:
return true;
default:
return false;
@@ -116,9 +118,11 @@ CpuModel midr_to_model(uint32_t midr)
model = CpuModel::GENERIC_FP16;
}
break;
+ case 0xd0c: // N1
+ model = CpuModel::N1;
+ break;
case 0xd06: // A65
case 0xd0b: // A76
- case 0xd0c: // N1
case 0xd0d: // A77
case 0xd0e: // A76AE
case 0xd41: // A78
diff --git a/src/cpu/kernels/CpuAddKernel.cpp b/src/cpu/kernels/CpuAddKernel.cpp
index 1648a46cdc..ec210a4a71 100644
--- a/src/cpu/kernels/CpuAddKernel.cpp
+++ b/src/cpu/kernels/CpuAddKernel.cpp
@@ -33,6 +33,11 @@
#include "src/cpu/kernels/add/list.h"
#include <array>
+namespace
+{
+ static constexpr size_t default_mws_N1_fp32_neon = 24536;
+ static constexpr size_t default_mws_V1_fp32_neon = 40510;
+}
namespace arm_compute
{
namespace cpu
@@ -267,8 +272,41 @@ const std::vector<CpuAddKernel::AddKernel> &CpuAddKernel::get_available_kernels(
size_t CpuAddKernel::get_mws(const CPUInfo &platform, size_t thread_count) const
{
ARM_COMPUTE_UNUSED(thread_count);
- ARM_COMPUTE_UNUSED(platform);
+#if defined(ENABLE_FP32_KERNELS)
+ if(this->_run_method == &add_fp32_neon)
+ {
+ size_t mws = ICPPKernel::default_mws;
+ if(platform.get_cpu_model() == CPUModel::N1)
+ {
+ mws = default_mws_N1_fp32_neon;
+ }
+ else if(platform.get_cpu_model() == CPUModel::V1)
+ {
+ mws = default_mws_V1_fp32_neon;
+ }
+ else
+ {
+ return ICPPKernel::default_mws;
+ }
+
+ // tensor is 1D or was re-interpreted as 1D
+ if(this->window().shape().num_dimensions() == 1)
+ {
+ return mws;
+ }
+ else
+ {
+ // scale mws down by the number of elements along all the dimensions (x, z, w, etc) except the one
+ // that we parallelize along (the y dimension). This allows for parallelization when the Y_SIZE is small
+ // but the other sizes are large, which boosts performance.
+ mws = static_cast<size_t>(mws / (this->window().num_iterations_total() / this->window().num_iterations(1)));
+ return std::max(static_cast<size_t>(1), mws);
+ }
+ }
+#else /* ENABLE_FP32_KERNELS */
+ ARM_COMPUTE_UNUSED(platform);
+#endif /* ENABLE_FP32_KERNELS */
return ICPPKernel::default_mws;
}
diff --git a/src/cpu/kernels/CpuAddKernel.h b/src/cpu/kernels/CpuAddKernel.h
index e2062c8c33..9921feabe2 100644
--- a/src/cpu/kernels/CpuAddKernel.h
+++ b/src/cpu/kernels/CpuAddKernel.h
@@ -85,7 +85,7 @@ public:
* @param[in] platform The CPU platform used to create the context.
* @param[in] thread_count Number of threads in the execution.
*
- * @return[out] small_network_mws Minimum workload size for requsted configuration.
+ * @return[out] mws Minimum workload size for requested configuration.
*/
size_t get_mws(const CPUInfo &platform, size_t thread_count) const override;
diff --git a/src/cpu/kernels/CpuElementwiseKernel.cpp b/src/cpu/kernels/CpuElementwiseKernel.cpp
index 4b285fc2be..e76b05f296 100644
--- a/src/cpu/kernels/CpuElementwiseKernel.cpp
+++ b/src/cpu/kernels/CpuElementwiseKernel.cpp
@@ -32,6 +32,14 @@
#include <arm_neon.h>
+namespace
+{
+ static constexpr size_t default_min_max_mws_N1_fp32_neon = 25308;
+ static constexpr size_t default_min_max_mws_V1_fp32_neon = 34772;
+ static constexpr size_t default_div_mws_N1_fp32_neon = 19043;
+ static constexpr size_t default_div_mws_V1_fp32_neon = 25511;
+}
+
namespace arm_compute
{
namespace cpu
@@ -401,6 +409,48 @@ Status CpuArithmeticKernel::validate(ArithmeticOperation op, const ITensorInfo *
return Status{};
}
+size_t CpuArithmeticKernel::get_mws(const CPUInfo &platform, size_t thread_count) const
+{
+ ARM_COMPUTE_UNUSED(thread_count);
+
+#if defined(ENABLE_FP32_KERNELS)
+ if(this->_run_method == &neon_fp32_elementwise_binary<ArithmeticOperation::MIN>
+ || this->_run_method == &neon_fp32_elementwise_binary<ArithmeticOperation::MAX>)
+ {
+ size_t mws = ICPPKernel::default_mws;
+ if(platform.get_cpu_model() == CPUModel::N1)
+ {
+ mws = default_min_max_mws_N1_fp32_neon;
+ }
+ else if(platform.get_cpu_model() == CPUModel::V1)
+ {
+ mws = default_min_max_mws_V1_fp32_neon;
+ }
+ else
+ {
+ return ICPPKernel::default_mws;
+ }
+
+ // tensor is 1D or was re-interpreted as 1D
+ if(this->window().shape().num_dimensions() == 1)
+ {
+ return mws;
+ }
+ else
+ {
+ // scale mws down by the number of elements along all the dimensions (x, z, w, etc) except the one
+ // that we parallelize along (the y dimension). This allows for parallelization when the Y_SIZE is small
+ // but the other sizes are large, which boosts performance.
+ mws = static_cast<size_t>(mws / (this->window().num_iterations_total() / this->window().num_iterations(1)));
+ return std::max(static_cast<size_t>(1), mws);
+ }
+ }
+#else /* ENABLE_FP32_KERNELS */
+ ARM_COMPUTE_UNUSED(platform);
+#endif /* ENABLE_FP32_KERNELS */
+ return ICPPKernel::default_mws;
+}
+
/** The division operator */
void CpuDivisionKernel::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst)
@@ -410,6 +460,47 @@ void CpuDivisionKernel::configure(const ITensorInfo *src0, const ITensorInfo *sr
CpuArithmeticKernel::configure_common(src0, src1, dst);
}
+size_t CpuDivisionKernel::get_mws(const CPUInfo &platform, size_t thread_count) const
+{
+ ARM_COMPUTE_UNUSED(thread_count);
+
+#if defined(ENABLE_FP32_KERNELS)
+ if(this->_run_method == &neon_fp32_elementwise_binary<ArithmeticOperation::DIV>)
+ {
+ size_t mws = ICPPKernel::default_mws;
+ if(platform.get_cpu_model() == CPUModel::N1)
+ {
+ mws = default_div_mws_N1_fp32_neon;
+ }
+ else if(platform.get_cpu_model() == CPUModel::V1)
+ {
+ mws = default_div_mws_V1_fp32_neon;
+ }
+ else
+ {
+ return ICPPKernel::default_mws;
+ }
+
+ // tensor is 1D or was re-interpreted as 1D
+ if(this->window().shape().num_dimensions() == 1)
+ {
+ return mws;
+ }
+ else
+ {
+ // scale mws down by the number of elements along all the dimensions (x, z, w, etc) except the one
+ // that we parallelize along (the y dimension). This allows for parallelization when the Y_SIZE is small
+ // but the other sizes are large, which boosts performance.
+ mws = static_cast<size_t>(mws / (this->window().num_iterations_total() / this->window().num_iterations(1)));
+ return std::max(static_cast<size_t>(1), mws);
+ }
+ }
+#else /* ENABLE_FP32_KERNELS */
+ ARM_COMPUTE_UNUSED(platform);
+#endif /* ENABLE_FP32_KERNELS */
+ return ICPPKernel::default_mws;
+}
+
Status CpuDivisionKernel::validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src0, 1, DataType::S32, DataType::F16, DataType::F32);
diff --git a/src/cpu/kernels/CpuElementwiseKernel.h b/src/cpu/kernels/CpuElementwiseKernel.h
index 2785e0a44c..634e38bf9f 100644
--- a/src/cpu/kernels/CpuElementwiseKernel.h
+++ b/src/cpu/kernels/CpuElementwiseKernel.h
@@ -72,8 +72,8 @@ protected:
static Status validate_arguments_common(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst);
protected:
- std::function<ElementwiseFunction> _run_method{ nullptr };
- std::string _name{};
+ ElementwiseKernelPtr _run_method{ nullptr };
+ std::string _name{};
};
class CpuArithmeticKernel : public CpuElementwiseKernel<CpuArithmeticKernel>
@@ -100,6 +100,15 @@ public:
static const std::vector<CpuElementwiseKernel<CpuArithmeticKernel>::ElementwiseKernel> &get_available_kernels();
+ /** Return minimum workload size of the relevant kernel
+ *
+ * @param[in] platform The CPU platform used to create the context.
+ * @param[in] thread_count Number of threads in the execution.
+ *
+ * @return[out] mws Minimum workload size for requested configuration.
+ */
+ size_t get_mws(const CPUInfo &platform, size_t thread_count) const override;
+
protected:
/** Commmon configure function for element-wise operators with no additional options (e.g. Min, Max, SquaredDiff)
*/
@@ -108,16 +117,6 @@ protected:
static Status validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst);
ArithmeticOperation _op{};
-
-private:
- /** Function to get the micro kernel implementation
- *
- * @param[in] src0 First input tensor information
- * @param[in] src1 Second input tensor information
- * @param[in] dst Output tensor information
- *
- * @return the function instance for the micro kernel
- */
};
class CpuDivisionKernel : public CpuArithmeticKernel
@@ -141,6 +140,15 @@ public:
*/
static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst);
+ /** Return minimum workload size of the relevant kernel
+ *
+ * @param[in] platform The CPU platform used to create the context.
+ * @param[in] thread_count Number of threads in the execution.
+ *
+ * @return[out] mws Minimum workload size for requested configuration.
+ */
+ size_t get_mws(const CPUInfo &platform, size_t thread_count) const override;
+
protected:
// Inherited methods overridden:
static Status validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst);
diff --git a/src/cpu/kernels/CpuMulKernel.cpp b/src/cpu/kernels/CpuMulKernel.cpp
index 82e5445321..81bb85c3dd 100644
--- a/src/cpu/kernels/CpuMulKernel.cpp
+++ b/src/cpu/kernels/CpuMulKernel.cpp
@@ -34,6 +34,11 @@
#include <arm_neon.h>
+namespace
+{
+ static constexpr size_t default_mws_N1_fp32_neon = 22447;
+ static constexpr size_t default_mws_V1_fp32_neon = 38982;
+}
namespace arm_compute
{
namespace cpu
@@ -1909,6 +1914,47 @@ void CpuMulKernel::configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *
ICpuKernel::configure(win);
}
+size_t CpuMulKernel::get_mws(const CPUInfo &platform, size_t thread_count) const
+{
+ ARM_COMPUTE_UNUSED(thread_count);
+
+#if defined(ENABLE_FP32_KERNELS)
+ if(this->_func_float == &mul_F32_F32_F32)
+ {
+ size_t mws = ICPPKernel::default_mws;
+ if(platform.get_cpu_model() == CPUModel::N1)
+ {
+ mws = default_mws_N1_fp32_neon;
+ }
+ else if(platform.get_cpu_model() == CPUModel::V1)
+ {
+ mws = default_mws_V1_fp32_neon;
+ }
+ else
+ {
+ return ICPPKernel::default_mws;
+ }
+
+ // tensor is 1D or was re-interpreted as 1D
+ if(this->window().shape().num_dimensions() == 1)
+ {
+ return mws;
+ }
+ else
+ {
+ // scale mws down by the number of elements along all the dimensions (x, z, w, etc) except the one
+ // that we parallelize along (the y dimension). This allows for parallelization when the Y_SIZE is small
+ // but the other sizes are large, which boosts performance.
+ mws = static_cast<size_t>(mws / (this->window().num_iterations_total() / this->window().num_iterations(1)));
+ return std::max(static_cast<size_t>(1), mws);
+ }
+ }
+#else /* ENABLE_FP32_KERNELS */
+ ARM_COMPUTE_UNUSED(platform);
+#endif /* ENABLE_FP32_KERNELS */
+ return ICPPKernel::default_mws;
+}
+
Status CpuMulKernel::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, ConvertPolicy overflow_policy,
RoundingPolicy rounding_policy)
{
diff --git a/src/cpu/kernels/CpuMulKernel.h b/src/cpu/kernels/CpuMulKernel.h
index c92e1efdf4..73ffc0dd2b 100644
--- a/src/cpu/kernels/CpuMulKernel.h
+++ b/src/cpu/kernels/CpuMulKernel.h
@@ -81,6 +81,15 @@ public:
const char *name() const override;
size_t get_mws(const CPUInfo &platform, size_t thread_count) const override;
+ /** Return minimum workload size of the relevant kernel
+ *
+ * @param[in] platform The CPU platform used to create the context.
+ * @param[in] thread_count Number of threads in the execution.
+ *
+ * @return[out] mws Minimum workload size for requested configuration.
+ */
+ size_t get_mws(const CPUInfo &platform, size_t thread_count) const override;
+
/** Get the preferred dimension in which the scheduler splits the work into multiple jobs.
*
* @return The split dimension hint.
diff --git a/src/cpu/kernels/CpuSubKernel.cpp b/src/cpu/kernels/CpuSubKernel.cpp
index d908e4ed28..ad74dda85d 100644
--- a/src/cpu/kernels/CpuSubKernel.cpp
+++ b/src/cpu/kernels/CpuSubKernel.cpp
@@ -31,6 +31,11 @@
#include "src/core/helpers/WindowHelpers.h"
#include "src/cpu/kernels/sub/neon/list.h"
+namespace
+{
+ static constexpr size_t default_mws_N1_fp32_neon = 24385;
+ static constexpr size_t default_mws_V1_fp32_neon = 40520;
+}
namespace arm_compute
{
namespace cpu
@@ -137,6 +142,47 @@ void CpuSubKernel::configure(const ITensorInfo *src0, const ITensorInfo *src1, I
ICpuKernel::configure(win);
}
+size_t CpuSubKernel::get_mws(const CPUInfo &platform, size_t thread_count) const
+{
+ ARM_COMPUTE_UNUSED(thread_count);
+
+#if defined(ENABLE_FP32_KERNELS)
+ if(this->_run_method == &sub_same_neon<float>)
+ {
+ size_t mws = ICPPKernel::default_mws;
+ if(platform.get_cpu_model() == CPUModel::N1)
+ {
+ mws = default_mws_N1_fp32_neon;
+ }
+ else if(platform.get_cpu_model() == CPUModel::V1)
+ {
+ mws = default_mws_V1_fp32_neon;
+ }
+ else
+ {
+ return ICPPKernel::default_mws;
+ }
+
+ // tensor is 1D or was re-interpreted as 1D
+ if(this->window().shape().num_dimensions() == 1)
+ {
+ return mws;
+ }
+ else
+ {
+ // scale mws down by the number of elements along all the dimensions (x, z, w, etc) except the one
+ // that we parallelize along (the y dimension). This allows for parallelization when the Y_SIZE is small
+ // but the other sizes are large, which boosts performance.
+ mws = static_cast<size_t>(mws / (this->window().num_iterations_total() / this->window().num_iterations(1)));
+ return std::max(static_cast<size_t>(1), mws);
+ }
+ }
+#else /* ENABLE_FP32_KERNELS */
+ ARM_COMPUTE_UNUSED(platform);
+#endif /* ENABLE_FP32_KERNELS */
+ return ICPPKernel::default_mws;
+}
+
Status CpuSubKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst);
diff --git a/src/cpu/kernels/CpuSubKernel.h b/src/cpu/kernels/CpuSubKernel.h
index e835bac3d5..3d80b34279 100644
--- a/src/cpu/kernels/CpuSubKernel.h
+++ b/src/cpu/kernels/CpuSubKernel.h
@@ -73,6 +73,15 @@ public:
void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
const char *name() const override;
+ /** Return minimum workload size of the relevant kernel
+ *
+ * @param[in] platform The CPU platform used to create the context.
+ * @param[in] thread_count Number of threads in the execution.
+ *
+ * @return[out] mws Minimum workload size for requested configuration.
+ */
+ size_t get_mws(const CPUInfo &platform, size_t thread_count) const override;
+
struct SubKernel
{
const char *name;