aboutsummaryrefslogtreecommitdiff
path: root/src/core/cpu
diff options
context:
space:
mode:
authorGeorgios Pinitas <georgios.pinitas@arm.com>2021-06-25 05:42:57 +0100
committerGeorgios Pinitas <georgios.pinitas@arm.com>2021-06-29 12:38:33 +0000
commit5fdde99f4271891a40c02cd1e89f1344aa84583a (patch)
tree35944b8bb0eee6aa9bbca08c38325f10cf66370c /src/core/cpu
parent4a95bba6ca61ce99995ece6fd237b5498c9f322c (diff)
downloadComputeLibrary-5fdde99f4271891a40c02cd1e89f1344aa84583a.tar.gz
Improve selection speed of CPU implementations
CPU micro-kernel to be used was picked during kernel execution. Move selection during configuration to reduce runtime overhead. Standardize kernel names as follows: <simd_tech>_<data_type>_<data_layout>_<kernel_name> e.g. sve_fp32_nhwc_scale Signed-off-by: Georgios Pinitas <georgios.pinitas@arm.com> Change-Id: I544f1c08c8fef0f130a3bde61882ccb9a1f47f21 Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5855 Reviewed-by: Michele Di Giorgio <michele.digiorgio@arm.com> Tested-by: Arm Jenkins <bsgcomp@arm.com>
Diffstat (limited to 'src/core/cpu')
-rw-r--r--src/core/cpu/kernels/CpuActivationKernel.cpp35
-rw-r--r--src/core/cpu/kernels/CpuActivationKernel.h12
-rw-r--r--src/core/cpu/kernels/CpuAddKernel.cpp59
-rw-r--r--src/core/cpu/kernels/CpuAddKernel.h12
-rw-r--r--src/core/cpu/kernels/CpuElementwiseKernel.cpp103
-rw-r--r--src/core/cpu/kernels/CpuElementwiseKernel.h62
-rw-r--r--src/core/cpu/kernels/CpuFloorKernel.cpp26
-rw-r--r--src/core/cpu/kernels/CpuFloorKernel.h12
-rw-r--r--src/core/cpu/kernels/CpuPool2dKernel.cpp48
-rw-r--r--src/core/cpu/kernels/CpuPool2dKernel.h5
-rw-r--r--src/core/cpu/kernels/CpuScaleKernel.cpp41
-rw-r--r--src/core/cpu/kernels/CpuScaleKernel.h32
-rw-r--r--src/core/cpu/kernels/CpuSoftmaxKernel.cpp84
-rw-r--r--src/core/cpu/kernels/CpuSoftmaxKernel.h32
-rw-r--r--src/core/cpu/kernels/CpuSubKernel.cpp36
-rw-r--r--src/core/cpu/kernels/CpuSubKernel.h25
16 files changed, 315 insertions, 309 deletions
diff --git a/src/core/cpu/kernels/CpuActivationKernel.cpp b/src/core/cpu/kernels/CpuActivationKernel.cpp
index 24642f1efb..dad2ecfc5b 100644
--- a/src/core/cpu/kernels/CpuActivationKernel.cpp
+++ b/src/core/cpu/kernels/CpuActivationKernel.cpp
@@ -63,57 +63,57 @@ static const ActivationKernel available_kernels[] =
{
#if defined(ARM_COMPUTE_ENABLE_SVE)
{
- "fp16_sve_activation",
+ "sve_fp16_activation",
[](const ActivationSelectorData & data) { return data.dt == DataType::F16 && data.ci.has_sve(); },
REGISTER_FP16_SVE(arm_compute::cpu::fp16_sve_activation)
},
{
- "fp32_sve_activation",
+ "sve_fp32_activation",
[](const ActivationSelectorData & data) { return data.dt == DataType::F32 && data.ci.has_sve(); },
REGISTER_FP32_SVE(arm_compute::cpu::fp32_sve_activation)
},
#endif /* defined(ARM_COMPUTE_ENABLE_SVE) */
#if defined(ARM_COMPUTE_ENABLE_NEON)
{
- "fp16_neon_activation",
+ "neon_fp16_activation",
[](const ActivationSelectorData & data) { return data.dt == DataType::F16; },
REGISTER_FP16_NEON(arm_compute::cpu::fp16_neon_activation)
},
{
- "fp32_neon_activation",
+ "neon_fp32_activation",
[](const ActivationSelectorData & data) { return data.dt == DataType::F32; },
REGISTER_FP32_NEON(arm_compute::cpu::fp32_neon_activation)
},
#endif /* defined(ARM_COMPUTE_ENABLE_NEON) */
#if defined(ARM_COMPUTE_ENABLE_SVE2)
{
- "qasymm8_sve_activation",
+ "sve_qu8_activation",
[](const ActivationSelectorData & data) { return data.dt == DataType::QASYMM8 && data.ci.has_sve2(); },
REGISTER_QASYMM8_SVE(arm_compute::cpu::qasymm8_sve_activation)
},
{
- "qasymm8_signed_sve_activation",
+ "sve_qs8_activation",
[](const ActivationSelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED && data.ci.has_sve2(); },
REGISTER_QASYMM8_SIGNED_SVE(arm_compute::cpu::qasymm8_signed_sve_activation)
},
{
- "qsymm16_sve_activation",
+ "sve_qs16_activation",
[](const ActivationSelectorData & data) { return data.dt == DataType::QSYMM16 && data.ci.has_sve2(); },
REGISTER_QSYMM16_SVE(arm_compute::cpu::qsymm16_sve_activation)
},
#endif /* defined(ARM_COMPUTE_ENABLE_SVE2) */
{
- "qasymm8_neon_activation",
+ "neon_qu8_activation",
[](const ActivationSelectorData & data) { return data.dt == DataType::QASYMM8; },
REGISTER_QASYMM8_NEON(arm_compute::cpu::qasymm8_neon_activation)
},
{
- "qasymm8_signed_neon_activation",
+ "neon_qs8_activation",
[](const ActivationSelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED; },
REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::qasymm8_signed_neon_activation)
},
{
- "qsymm16_neon_activation",
+ "neon_qs16_activation",
[](const ActivationSelectorData & data) { return data.dt == DataType::QSYMM16; },
REGISTER_QSYMM16_NEON(arm_compute::cpu::qsymm16_neon_activation)
},
@@ -206,10 +206,14 @@ std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *src,
void CpuActivationKernel::configure(const ITensorInfo *src, ITensorInfo *dst, ActivationLayerInfo activation_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, activation_info));
- _act_info = activation_info;
+ const auto uk = get_implementation(ActivationSelectorData{ src->data_type(), CPUInfo::get() });
+ ARM_COMPUTE_ERROR_ON_NULLPTR(uk);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, activation_info));
+ _act_info = activation_info;
+ _run_method = uk->ukernel;
+ _name = std::string("CpuActivationKernel").append("/").append(uk->name);
// Configure kernel window
auto win_config = validate_and_configure_window(src, dst);
@@ -239,18 +243,17 @@ void CpuActivationKernel::run_op(ITensorPack &tensors, const Window &window, con
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
ARM_COMPUTE_ERROR_ON(tensors.empty());
+ ARM_COMPUTE_ERROR_ON(_run_method == nullptr);
const ITensor *src = tensors.get_const_tensor(TensorType::ACL_SRC);
ITensor *dst = tensors.get_tensor(TensorType::ACL_DST);
- const auto *uk = get_implementation(ActivationSelectorData{ src->info()->data_type(), CPUInfo::get() });
-
- uk->ukernel(src, dst, _act_info, window);
+ _run_method(src, dst, _act_info, window);
}
const char *CpuActivationKernel::name() const
{
- return "CpuActivationKernel";
+ return _name.c_str();
}
} // namespace kernels
} // namespace cpu
diff --git a/src/core/cpu/kernels/CpuActivationKernel.h b/src/core/cpu/kernels/CpuActivationKernel.h
index de71014303..37650345fe 100644
--- a/src/core/cpu/kernels/CpuActivationKernel.h
+++ b/src/core/cpu/kernels/CpuActivationKernel.h
@@ -49,12 +49,9 @@ public:
* @param[in] activation_info Activation layer information.
*/
void configure(const ITensorInfo *src, ITensorInfo *dst, ActivationLayerInfo activation_info);
- /** Static function to check if given info will lead to a valid configuration of @ref CpuActivationKernel
+ /** Static function to check if given info will lead to a valid configuration
*
- * @param[in] src Source tensor info. In case of @p dst tensor info = nullptr, this tensor will store the result
- * of the activation function. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM16/F16/F32.
- * @param[in] dst Destination tensor info. Data type supported: same as @p src
- * @param[in] act_info Activation layer information.
+ * Similar to CpuActivationKernel::configure()
*
* @return a status
*/
@@ -65,7 +62,12 @@ public:
const char *name() const override;
private:
+ using ActivationKernelPtr = std::add_pointer<void(const ITensor *, ITensor *, const ActivationLayerInfo &, const Window &)>::type;
+
+private:
ActivationLayerInfo _act_info{};
+ ActivationKernelPtr _run_method{ nullptr };
+ std::string _name{};
};
} // namespace kernels
} // namespace cpu
diff --git a/src/core/cpu/kernels/CpuAddKernel.cpp b/src/core/cpu/kernels/CpuAddKernel.cpp
index 8d74b4027b..12766037a7 100644
--- a/src/core/cpu/kernels/CpuAddKernel.cpp
+++ b/src/core/cpu/kernels/CpuAddKernel.cpp
@@ -69,7 +69,7 @@ static const AddKernel available_kernels[] =
{
#if defined(ARM_COMPUTE_ENABLE_SVE2)
{
- "add_qasymm8_sve",
+ "sve2_qu8_add",
[](const AddSelectorData & data)
{
return ((data.dt1 == data.dt2) && (data.dt1 == DataType::QASYMM8)) && data.ci.has_sve();
@@ -77,7 +77,7 @@ static const AddKernel available_kernels[] =
REGISTER_QASYMM8_SVE(arm_compute::cpu::add_qasymm8_sve)
},
{
- "add_qasymm8_signed_sve",
+ "sve2_qs8_add",
[](const AddSelectorData & data)
{
return ((data.dt1 == data.dt2) && (data.dt1 == DataType::QASYMM8_SIGNED)) && data.ci.has_sve();
@@ -85,7 +85,7 @@ static const AddKernel available_kernels[] =
REGISTER_QASYMM8_SIGNED_SVE(arm_compute::cpu::add_qasymm8_signed_sve)
},
{
- "add_qsymm16_sve",
+ "sve2_qs16_add",
[](const AddSelectorData & data)
{
return ((data.dt1 == data.dt2) && (data.dt1 == DataType::QSYMM16)) && data.ci.has_sve();
@@ -95,7 +95,7 @@ static const AddKernel available_kernels[] =
#endif /* !defined(ARM_COMPUTE_ENABLE_SVE2) */
#if defined(ARM_COMPUTE_ENABLE_SVE)
{
- "add_same_sve",
+ "sve_fp32_add",
[](const AddSelectorData & data)
{
return ((data.dt1 == data.dt2) && (data.dt1 == DataType::F32)) && data.ci.has_sve();
@@ -103,7 +103,7 @@ static const AddKernel available_kernels[] =
REGISTER_FP32_SVE(arm_compute::cpu::add_same_sve<float>)
},
{
- "add_same_sve",
+ "sve_fp16_add",
[](const AddSelectorData & data)
{
return ((data.dt1 == data.dt2) && (data.dt1 == DataType::F16)) && data.ci.has_sve();
@@ -111,7 +111,7 @@ static const AddKernel available_kernels[] =
REGISTER_FP16_SVE(arm_compute::cpu::add_same_sve<float16_t>)
},
{
- "add_same_sve",
+ "sve_u8_add",
[](const AddSelectorData & data)
{
return ((data.dt1 == data.dt2) && (data.dt1 == data.dt3) && (data.dt1 == DataType::U8)) && data.ci.has_sve();
@@ -119,7 +119,7 @@ static const AddKernel available_kernels[] =
REGISTER_INTEGER_SVE(arm_compute::cpu::add_same_sve<uint8_t>)
},
{
- "add_same_sve",
+ "sve_s16_add",
[](const AddSelectorData & data)
{
return ((data.dt1 == data.dt2) && (data.dt1 == data.dt3) && (data.dt1 == DataType::S16)) && data.ci.has_sve();
@@ -127,7 +127,7 @@ static const AddKernel available_kernels[] =
REGISTER_INTEGER_SVE(arm_compute::cpu::add_same_sve<int16_t>)
},
{
- "add_same_sve",
+ "sve_s32_add",
[](const AddSelectorData & data)
{
return ((data.dt1 == data.dt2) && (data.dt1 == data.dt3) && (data.dt1 == DataType::S32)) && data.ci.has_sve();
@@ -135,7 +135,7 @@ static const AddKernel available_kernels[] =
REGISTER_INTEGER_SVE(arm_compute::cpu::add_same_sve<int32_t>)
},
{
- "add_u8_s16_s16_sve",
+ "sve_u8_s16_s16_add",
[](const AddSelectorData & data)
{
return ((data.dt1 == DataType::U8) && (data.dt2 == DataType::S16)) && data.ci.has_sve();
@@ -143,7 +143,7 @@ static const AddKernel available_kernels[] =
REGISTER_INTEGER_SVE(arm_compute::cpu::add_u8_s16_s16_sve)
},
{
- "add_s16_u8_s16_sve",
+ "sve_s16_u8_s16_add",
[](const AddSelectorData & data)
{
return ((data.dt1 == DataType::S16) && (data.dt2 == DataType::U8)) && data.ci.has_sve();
@@ -151,7 +151,7 @@ static const AddKernel available_kernels[] =
REGISTER_INTEGER_SVE(arm_compute::cpu::add_s16_u8_s16_sve)
},
{
- "add_u8_u8_s16_sve",
+ "sve_u8_u8_s16_add",
[](const AddSelectorData & data)
{
return ((data.dt1 == data.dt2) && (data.dt3 == DataType::S16)) && data.ci.has_sve();
@@ -161,13 +161,13 @@ static const AddKernel available_kernels[] =
#endif /* defined(ARM_COMPUTE_ENABLE_SVE) */
#if defined(ARM_COMPUTE_ENABLE_NEON)
{
- "add_same_neon",
+ "neon_fp32_add",
[](const AddSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == DataType::F32)); },
REGISTER_FP32_NEON(arm_compute::cpu::add_same_neon<float>)
},
#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
{
- "add_same_neon",
+ "neon_fp16_add",
[](const AddSelectorData & data)
{
return ((data.dt1 == data.dt2) && (data.dt1 == DataType::F16)) && data.ci.has_fp16();
@@ -176,49 +176,49 @@ static const AddKernel available_kernels[] =
},
#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */
{
- "add_same_neon",
+ "neon_u8_add",
[](const AddSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == data.dt3) && (data.dt1 == DataType::U8)); },
REGISTER_INTEGER_NEON(arm_compute::cpu::add_same_neon<uint8_t>)
},
{
- "add_same_neon",
+ "neon_s16_add",
[](const AddSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == data.dt3) && (data.dt1 == DataType::S16)); },
REGISTER_INTEGER_NEON(arm_compute::cpu::add_same_neon<int16_t>)
},
{
- "add_same_neon",
+ "neon_s32_add",
[](const AddSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == data.dt3) && (data.dt1 == DataType::S32)); },
REGISTER_INTEGER_NEON(arm_compute::cpu::add_same_neon<int32_t>)
},
{
- "add_u8_s16_s16_neon",
+ "neon_u8_s16_s16_add",
[](const AddSelectorData & data) { return ((data.dt1 == DataType::U8) && (data.dt2 == DataType::S16)); },
REGISTER_INTEGER_NEON(arm_compute::cpu::add_u8_s16_s16_neon)
},
{
- "add_s16_u8_s16_neon",
+ "neon_s16_u8_s16_add",
[](const AddSelectorData & data) { return ((data.dt1 == DataType::S16) && (data.dt2 == DataType::U8)); },
REGISTER_INTEGER_NEON(arm_compute::cpu::add_s16_u8_s16_neon)
},
{
- "add_u8_u8_s16_neon",
+ "neon_u8_u8_s16_add",
[](const AddSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt3 == DataType::S16)); },
REGISTER_INTEGER_NEON(arm_compute::cpu::add_u8_u8_s16_neon)
},
#endif /* defined(ARM_COMPUTE_ENABLE_NEON) */
#if defined(ARM_COMPUTE_ENABLE_NEON) || defined(ARM_COMPUTE_ENABLE_SVE)
{
- "add_qasymm8_neon",
+ "neon_qu8_add",
[](const AddSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == DataType::QASYMM8)); },
REGISTER_QASYMM8_NEON(arm_compute::cpu::add_qasymm8_neon)
},
{
- "add_qasymm8_signed_neon",
+ "neon_qs8_add",
[](const AddSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == DataType::QASYMM8_SIGNED)); },
REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::add_qasymm8_signed_neon)
},
{
- "add_qsymm16_neon",
+ "neon_qs16_add",
[](const AddSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == DataType::QSYMM16)); },
REGISTER_QSYMM16_NEON(arm_compute::cpu::add_qsymm16_neon)
},
@@ -339,7 +339,12 @@ void CpuAddKernel::configure(const ITensorInfo *src0, const ITensorInfo *src1, I
ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst);
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*src0, *src1, *dst, policy));
- _policy = policy;
+ const auto uk = get_implementation(CPUInfo::get(), src0->data_type(), src1->data_type(), dst->data_type());
+ ARM_COMPUTE_ERROR_ON_NULLPTR(uk);
+
+ _policy = policy;
+ _run_method = uk->ukernel;
+ _name = std::string("CpuAddKernel").append("/").append(uk->name);
// Configure kernel window
auto win_config = validate_and_configure_window(*src0, *src1, *dst);
@@ -364,20 +369,18 @@ void CpuAddKernel::run_op(ITensorPack &tensors, const Window &window, const Thre
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
ARM_COMPUTE_ERROR_ON(tensors.empty());
+ ARM_COMPUTE_ERROR_ON(_run_method == nullptr);
const ITensor *src0 = tensors.get_const_tensor(TensorType::ACL_SRC_0);
const ITensor *src1 = tensors.get_const_tensor(TensorType::ACL_SRC_1);
ITensor *dst = tensors.get_tensor(TensorType::ACL_DST);
- const auto *uk = get_implementation(CPUInfo::get(), src0->info()->data_type(), src1->info()->data_type(), dst->info()->data_type());
- ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
-
- uk->ukernel(src0, src1, dst, _policy, window);
+ _run_method(src0, src1, dst, _policy, window);
}
const char *CpuAddKernel::name() const
{
- return "CpuAddKernel";
+ return _name.c_str();
}
} // namespace kernels
} // namespace cpu
diff --git a/src/core/cpu/kernels/CpuAddKernel.h b/src/core/cpu/kernels/CpuAddKernel.h
index a36ec7ad65..3ebaa462ee 100644
--- a/src/core/cpu/kernels/CpuAddKernel.h
+++ b/src/core/cpu/kernels/CpuAddKernel.h
@@ -61,12 +61,9 @@ public:
* @param[in] policy Overflow policy.
*/
void configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, ConvertPolicy policy);
- /** Static function to check if given info will lead to a valid configuration of @ref CpuAddKernel
+ /** Static function to check if given info will lead to a valid configuration
*
- * @param[in] src0 First input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32
- * @param[in] src1 Second input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32
- * @param[in] dst The dst tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32.
- * @param[in] policy Overflow policy.
+ * Similar to CpuAddKernel::configure()
*
* @return a status
*/
@@ -77,7 +74,12 @@ public:
const char *name() const override;
private:
+ using AddKernelPtr = std::add_pointer<void(const ITensor *, const ITensor *, ITensor *, const ConvertPolicy &, const Window &)>::type;
+
+private:
ConvertPolicy _policy{};
+ AddKernelPtr _run_method{ nullptr };
+ std::string _name{};
};
} // namespace kernels
} // namespace cpu
diff --git a/src/core/cpu/kernels/CpuElementwiseKernel.cpp b/src/core/cpu/kernels/CpuElementwiseKernel.cpp
index dc0c5b210d..dc574fce65 100644
--- a/src/core/cpu/kernels/CpuElementwiseKernel.cpp
+++ b/src/core/cpu/kernels/CpuElementwiseKernel.cpp
@@ -58,69 +58,68 @@ struct ElementwiseKernel
UKernelType *ukernel;
};
-template <ArithmeticOperation op>
-std::function<void(const ITensor *, const ITensor *, ITensor *, const Window &)>
-configure_arithm_func(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst)
+template <ArithmeticOperation op>
+CpuElementwiseKernel::UKernelInfo configure_arithm_func(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst)
{
ARM_COMPUTE_UNUSED(src1, dst);
static ElementwiseKernel kernels[] =
{
#if defined(ARM_COMPUTE_ENABLE_SVE)
{
- "sve_elementwise_fp32",
+ "sve_fp32_elementwise",
[](const ElementwiseSelectorData & data) { return data.dt == DataType::F32 && data.ci.has_sve(); },
REGISTER_FP32_SVE((arm_compute::cpu::elementwise_arithmetic_op<op, float32_t>))
},
{
- "sve_elementwise_s32",
+ "sve_s32_elementwise",
[](const ElementwiseSelectorData & data) { return data.dt == DataType::S32 && data.ci.has_sve(); },
REGISTER_INTEGER_SVE((arm_compute::cpu::elementwise_arithmetic_op<op, int32_t>))
},
{
- "sve_elementwise_s16",
+ "sve_s16_elementwise",
[](const ElementwiseSelectorData & data) { return data.dt == DataType::S16 && data.ci.has_sve(); },
REGISTER_INTEGER_SVE((arm_compute::cpu::elementwise_arithmetic_op<op, int16_t>))
},
#endif /* defined(ARM_COMPUTE_ENABLE_SVE) */
#if defined(ARM_COMPUTE_ENABLE_NEON)
{
- "neon_elementwise_f32",
+ "neon_fp32_elementwise",
[](const ElementwiseSelectorData & data) { return data.dt == DataType::F32; },
REGISTER_FP32_NEON((arm_compute::cpu::elementwise_arithm_op<op, typename wrapper::traits::neon_vector<float, 4>>))
},
{
- "neon_elementwise_s32",
+ "neon_s32_elementwise",
[](const ElementwiseSelectorData & data) { return data.dt == DataType::S32; },
REGISTER_INTEGER_NEON((arm_compute::cpu::elementwise_arithm_op<op, typename wrapper::traits::neon_vector<int32_t, 4>>))
},
#endif /* defined(ARM_COMPUTE_ENABLE_NEON) */
#if defined(ARM_COMPUTE_ENABLE_SVE2)
{
- "sve2_elementwise_qu8",
+ "sve2_qu8_elementwise",
[](const ElementwiseSelectorData & data) { return data.dt == DataType::QASYMM8 && data.ci.has_sve2(); },
REGISTER_QASYMM8_SVE((arm_compute::cpu::elementwise_arithmetic_quantized_op<op, uint8_t>))
},
{
- "sve2_elementwise_qs8",
+ "sve2_qs8_elementwise",
[](const ElementwiseSelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED && data.ci.has_sve2(); },
REGISTER_QASYMM8_SIGNED_SVE((arm_compute::cpu::elementwise_arithmetic_quantized_op<op, int8_t>))
},
#endif /* defined(ARM_COMPUTE_ENABLE_SVE2) */
#if defined(ARM_COMPUTE_ENABLE_NEON) || defined(ARM_COMPUTE_ENABLE_SVE)
{
- "neon_elementwise_qu8",
+ "neon_qu8_elementwise",
[](const ElementwiseSelectorData & data) { return data.dt == DataType::QASYMM8; },
REGISTER_QASYMM8_NEON((arm_compute::cpu::elementwise_arithm_op_quantized<op>))
},
{
- "neon_elementwise_qs8",
+ "neon_qs8_elementwise",
[](const ElementwiseSelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED; },
REGISTER_QASYMM8_SIGNED_NEON((arm_compute::cpu::elementwise_arithm_op_quantized_signed<op>))
},
#endif /* defined(ARM_COMPUTE_ENABLE_NEON) || defined(ARM_COMPUTE_ENABLE_SVE) */
#if defined(ARM_COMPUTE_ENABLE_SVE)
{
- "sve_elementwise_f16",
+ "sve_fp16_elementwise",
[](const ElementwiseSelectorData & data) { return data.dt == DataType::F16 && data.ci.has_sve(); },
REGISTER_FP16_SVE((arm_compute::cpu::elementwise_arithmetic_op<op, float16_t>))
},
@@ -128,13 +127,13 @@ configure_arithm_func(const ITensorInfo *src0, const ITensorInfo *src1, ITensorI
#if defined(ARM_COMPUTE_ENABLE_NEON)
#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
{
- "neon_elementwise_f16",
+ "neon_fp16_elementwise",
[](const ElementwiseSelectorData & data) { return data.dt == DataType::F16 && data.ci.has_fp16(); },
REGISTER_FP16_NEON((arm_compute::cpu::elementwise_arithm_op<op, typename wrapper::traits::neon_vector<float16_t, 8>>))
},
#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */
{
- "neon_elementwise_s16",
+ "neon_s16_elementwise",
[](const ElementwiseSelectorData & data) { return data.dt == DataType::S16; },
REGISTER_INTEGER_NEON((arm_compute::cpu::elementwise_arithm_op<op, typename wrapper::traits::neon_vector<int16_t, 8>>))
},
@@ -145,98 +144,97 @@ configure_arithm_func(const ITensorInfo *src0, const ITensorInfo *src1, ITensorI
{
if(uk.is_selected({ src0->data_type(), CPUInfo::get() }))
{
- return uk.ukernel;
+ return { uk.name, uk.ukernel };
}
}
- return nullptr;
+ return { "", nullptr };
}
-template <ComparisonOperation op>
-std::function<void(const ITensor *, const ITensor *, ITensor *, const Window &)>
-configure_comp_func(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst)
+template <ComparisonOperation op>
+CpuElementwiseKernel::UKernelInfo configure_comp_func(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst)
{
ARM_COMPUTE_UNUSED(src1, dst);
static ElementwiseKernel kernels[] =
{
#if defined(ARM_COMPUTE_ENABLE_SVE)
{
- "sve_comparison_u8",
+ "sve_u8_comparison",
[](const ElementwiseSelectorData & data) { return data.dt == DataType::U8 && data.ci.has_sve(); },
REGISTER_INTEGER_SVE((arm_compute::cpu::elementwise_comparison_op<op, uint8_t>))
},
{
- "sve_comparison_f32",
+ "sve_fp32_comparison",
[](const ElementwiseSelectorData & data) { return data.dt == DataType::F32 && data.ci.has_sve(); },
REGISTER_FP32_SVE((arm_compute::cpu::elementwise_comparison_op<op, float>))
},
{
- "sve_comparison_s16",
+ "sve_s16_comparison",
[](const ElementwiseSelectorData & data) { return data.dt == DataType::S16 && data.ci.has_sve(); },
REGISTER_INTEGER_SVE((arm_compute::cpu::elementwise_comparison_op<op, int16_t>))
},
{
- "sve_comparison_s32",
+ "sve_s32_comparison",
[](const ElementwiseSelectorData & data) { return data.dt == DataType::S32 && data.ci.has_sve(); },
REGISTER_INTEGER_SVE((arm_compute::cpu::elementwise_comparison_op<op, int32_t>))
},
#endif /* defined(ARM_COMPUTE_ENABLE_SVE) */
#if defined(ARM_COMPUTE_ENABLE_NEON)
{
- "neon_comparison_u8",
+ "neon_u8_comparison",
[](const ElementwiseSelectorData & data) { return data.dt == DataType::U8; },
REGISTER_INTEGER_NEON((arm_compute::cpu::elementwise_comp_op_8<op, uint8_t, uint8x16_t>))
},
{
- "neon_comparison_f32",
+ "neon_fp32_comparison",
[](const ElementwiseSelectorData & data) { return data.dt == DataType::F32; },
REGISTER_FP32_NEON((arm_compute::cpu::elementwise_comp_op_32<op, float, float32x4_t>))
},
{
- "neon_comparison_s16",
+ "neon_s16_comparison",
[](const ElementwiseSelectorData & data) { return data.dt == DataType::S16; },
REGISTER_INTEGER_NEON((arm_compute::cpu::elementwise_comp_op_16<op, int16_t, int16x8_t>))
},
{
- "neon_comparison_s32",
+ "neon_s32_comparison",
[](const ElementwiseSelectorData & data) { return data.dt == DataType::S32; },
REGISTER_INTEGER_NEON((arm_compute::cpu::elementwise_comp_op_32<op, int32_t, int32x4_t>))
},
#endif /* defined(ARM_COMPUTE_ENABLE_NEON) */
#if defined(ARM_COMPUTE_ENABLE_SVE2)
{
- "sve_comparison_qu8",
+ "sve2_qu8_comparison",
[](const ElementwiseSelectorData & data) { return data.dt == DataType::QASYMM8 && data.ci.has_sve2(); },
REGISTER_QASYMM8_SVE((arm_compute::cpu::elementwise_comparison_quantized_op<op, uint8_t>))
},
{
- "sve_comparison_qs8",
+ "sve2_qs8_comparison",
[](const ElementwiseSelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED && data.ci.has_sve2(); },
REGISTER_QASYMM8_SIGNED_SVE((arm_compute::cpu::elementwise_comparison_quantized_op<op, int8_t>))
},
#endif /* defined(ARM_COMPUTE_ENABLE_SVE2) */
#if defined(ARM_COMPUTE_ENABLE_NEON) || defined(ARM_COMPUTE_ENABLE_SVE)
{
- "neon_comparison_qu8",
+ "neon_qu8_comparison",
[](const ElementwiseSelectorData & data) { return data.dt == DataType::QASYMM8; },
REGISTER_QASYMM8_NEON((arm_compute::cpu::elementwise_comp_op_quantized<op>))
},
{
- "neon_comparison_qs8",
+ "neon_qs8_comparison",
[](const ElementwiseSelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED; },
REGISTER_QASYMM8_SIGNED_NEON((arm_compute::cpu::elementwise_comp_op_quantized_signed<op>))
},
#endif /* defined(ARM_COMPUTE_ENABLE_NEON) || defined(ARM_COMPUTE_ENABLE_SVE) */
#if defined(ARM_COMPUTE_ENABLE_SVE)
{
- "sve_comparison_f16",
+ "sve_fp16_comparison",
[](const ElementwiseSelectorData & data) { return data.dt == DataType::F16 && data.ci.has_sve(); },
REGISTER_FP16_SVE((arm_compute::cpu::elementwise_comparison_op<op, float16_t>))
},
#endif /* defined(ARM_COMPUTE_ENABLE_SVE) */
#if defined(ARM_COMPUTE_ENABLE_NEON) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
{
- "neon_comparison_f16",
+ "neon_fp16_comparison",
[](const ElementwiseSelectorData & data) { return data.dt == DataType::F16 && data.ci.has_fp16(); },
REGISTER_FP16_NEON((arm_compute::cpu::elementwise_comp_op_16<op, float16_t, float16x8_t>))
},
@@ -247,11 +245,11 @@ configure_comp_func(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInf
{
if(uk.is_selected({ src0->data_type(), CPUInfo::get() }))
{
- return uk.ukernel;
+ return { uk.name, uk.ukernel };
}
}
- return nullptr;
+ return { "", nullptr };
}
} // namespace
@@ -278,6 +276,11 @@ void CpuElementwiseKernel::configure_common(const ITensorInfo *src0, const ITens
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst);
+ const auto uk = get_implementation(src0, src1, dst);
+
+ _run_method = uk.ukernel;
+ _name = std::string("CpuElementwiseKernel").append("/").append(uk.name);
+
// If any of shapes is dynamic, expect a configured window and dst at run-time.
if(src0->is_dynamic() || src1->is_dynamic())
{
@@ -292,22 +295,26 @@ void CpuElementwiseKernel::configure_common(const ITensorInfo *src0, const ITens
void CpuElementwiseKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
{
ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON(_run_method == nullptr);
auto src0 = tensors.get_const_tensor(TensorType::ACL_SRC_0);
auto src1 = tensors.get_const_tensor(TensorType::ACL_SRC_1);
auto dst = tensors.get_tensor(TensorType::ACL_DST);
- auto function = get_implementation(src0->info(), src1->info(), dst->info());
- ARM_COMPUTE_ERROR_ON(function == nullptr);
- function(src0, src1, dst, window);
+ _run_method(src0, src1, dst, window);
+}
+
+const char *CpuElementwiseKernel::name() const
+{
+ return _name.c_str();
}
/** Arithmetic operators (min, max, squared_diff) */
void CpuArithmeticKernel::configure(ArithmeticOperation op, const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst)
{
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*src0, *src1, *dst));
- configure_common(src0, src1, dst);
_op = op;
+ configure_common(src0, src1, dst);
}
Status CpuArithmeticKernel::validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst)
@@ -329,8 +336,7 @@ Status CpuArithmeticKernel::validate(ArithmeticOperation op, const ITensorInfo *
return Status{};
}
-std::function<CpuElementwiseKernel::ElementwiseFunction>
-CpuArithmeticKernel::get_implementation(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst)
+CpuElementwiseKernel::UKernelInfo CpuArithmeticKernel::get_implementation(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst)
{
switch(_op)
{
@@ -349,7 +355,7 @@ CpuArithmeticKernel::get_implementation(const ITensorInfo *src0, const ITensorIn
default:
ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
}
- return nullptr;
+ return { "", nullptr };
}
/** The division operator */
@@ -357,8 +363,8 @@ CpuArithmeticKernel::get_implementation(const ITensorInfo *src0, const ITensorIn
void CpuDivisionKernel::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst)
{
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*src0, *src1, *dst));
- configure_common(src0, src1, dst);
_op = ArithmeticOperation::DIV;
+ configure_common(src0, src1, dst);
}
Status CpuDivisionKernel::validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst)
@@ -378,8 +384,8 @@ Status CpuDivisionKernel::validate(const ITensorInfo *src0, const ITensorInfo *s
void CpuPowerKernel::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst)
{
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*src0, *src1, *dst));
- configure_common(src0, src1, dst);
_op = ArithmeticOperation::POWER;
+ configure_common(src0, src1, dst);
}
Status CpuPowerKernel::validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst)
@@ -399,8 +405,8 @@ Status CpuPowerKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1
void CpuComparisonKernel::configure(ComparisonOperation op, const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst)
{
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*src0, *src1, *dst));
- configure_common(src0, src1, dst);
_op = op;
+ configure_common(src0, src1, dst);
}
Status CpuComparisonKernel::validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst)
@@ -422,8 +428,7 @@ Status CpuComparisonKernel::validate(ComparisonOperation op, const ITensorInfo *
return Status{};
}
-std::function<CpuElementwiseKernel::ElementwiseFunction>
-CpuComparisonKernel::get_implementation(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst)
+CpuElementwiseKernel::UKernelInfo CpuComparisonKernel::get_implementation(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst)
{
switch(_op)
{
@@ -442,7 +447,7 @@ CpuComparisonKernel::get_implementation(const ITensorInfo *src0, const ITensorIn
default:
ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
}
- return nullptr;
+ return { "", nullptr };
}
} // namespace kernels
} // namespace cpu
diff --git a/src/core/cpu/kernels/CpuElementwiseKernel.h b/src/core/cpu/kernels/CpuElementwiseKernel.h
index 952c6e3e25..50c8d29ac5 100644
--- a/src/core/cpu/kernels/CpuElementwiseKernel.h
+++ b/src/core/cpu/kernels/CpuElementwiseKernel.h
@@ -43,25 +43,19 @@ namespace kernels
class CpuElementwiseKernel : public ICpuKernel
{
public:
- const char *name() const override
- {
- return "CpuElementwiseKernel";
- }
-
CpuElementwiseKernel() = default;
ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuElementwiseKernel);
- /** Common signature for all the specialised arithmetic functions
- *
- * @param[in] src0 First tensor input info. Data types supported: QASYMM8/S16/F16/S32/F32.
- * @param[in] src1 Second tensor input info. Data types supported: Same as @p src0.
- * @param[out] dst Output tensor info. Data types supported: Dependent on subclass.
- * @param[in] window Region on which to execute the kernel.
- */
using ElementwiseFunction = void(const ITensor *, const ITensor *, ITensor *, const Window &);
+ struct UKernelInfo
+ {
+ std::string name;
+ std::function<ElementwiseFunction> ukernel;
+ };
// Inherited methods overridden:
void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ const char *name() const override;
protected:
/** Validate the argument passed to the kernel
@@ -85,7 +79,11 @@ protected:
*
* @return the function instance for the micro kernel
*/
- virtual std::function<ElementwiseFunction> get_implementation(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst) = 0;
+ virtual UKernelInfo get_implementation(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst) = 0;
+
+protected:
+ std::function<ElementwiseFunction> _run_method{ nullptr };
+ std::string _name{};
};
class CpuArithmeticKernel : public CpuElementwiseKernel
@@ -103,14 +101,11 @@ public:
*/
void configure(ArithmeticOperation op, const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst);
- /** Static function to check if given info will lead to a valid configuration of @ref cpu::kernels::CpuArithmeticKernel
+ /** Static function to check if given info will lead to a valid configuration
*
- * @param[in] op Arithmetic operation to be executed.
- * @param[in] src0 First tensor input info. Data types supported: QASYMM8/S16/F16/S32/F32.
- * @param[in] src1 Second tensor input info. Data types supported: Same as @p src0.
- * @param[in] dst Output tensor info. Data types supported: Same as @p src0.
+ * Similar to CpuArithmeticKernel::configure()
*
- * @return a Status
+ * @return a status
*/
static Status validate(ArithmeticOperation op, const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst);
@@ -129,7 +124,7 @@ private:
*
* @return the function instance for the micro kernel
*/
- std::function<ElementwiseFunction> get_implementation(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst) override;
+ UKernelInfo get_implementation(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst) override;
};
class CpuDivisionKernel : public CpuArithmeticKernel
@@ -146,13 +141,11 @@ public:
*/
void configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst);
- /** Static function to check if given info will lead to a valid configuration of @ref CpuDivisionKernel
+ /** Static function to check if given info will lead to a valid configuration
*
- * @param[in] src0 First tensor input info. Data types supported: S32/F16/F32.
- * @param[in] src1 Second tensor input info. Data types supported: Same as @p src0.
- * @param[in] dst Output tensor info. Data types supported: Same as @p src0.
+ * Similar to CpuDivisionKernel::configure()
*
- * @return a Status
+ * @return a status
*/
static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst);
@@ -175,13 +168,11 @@ public:
*/
void configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst);
- /** Static function to check if given info will lead to a valid configuration of @ref CpuPowerKernel
+ /** Static function to check if given info will lead to a valid configuration
*
- * @param[in] src0 First tensor input info. Data types supported: F16/F32.
- * @param[in] src1 Second tensor input info. Data types supported: Same as @p src0.
- * @param[in] dst Output tensor info. Data types supported: Same as @p src0.
+ * Similar to CpuPowerKernel::configure()
*
- * @return a Status
+ * @return a status
*/
static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst);
@@ -205,14 +196,11 @@ public:
*/
void configure(ComparisonOperation op, const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst);
- /** Static function to check if given info will lead to a valid configuration of @ref cpu::kernels::CpuComparisonKernel
+ /** Static function to check if given info will lead to a valid configuration
*
- * @param[in] op Comparison operation to be executed.
- * @param[in] src0 First tensor input info. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32.
- * @param[in] src1 Second tensor input info. Data types supported: Same as @p src0.
- * @param[in] dst Output tensor info. Data types supported: U8.
+ * Similar to CpuComparisonKernel::configure()
*
- * @return a Status
+ * @return a status
*/
static Status validate(ComparisonOperation op, const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst);
@@ -229,7 +217,7 @@ private:
*
* @return the function instance for the micro kernel
*/
- std::function<ElementwiseFunction> get_implementation(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst) override;
+ UKernelInfo get_implementation(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst) override;
ComparisonOperation _op{};
};
diff --git a/src/core/cpu/kernels/CpuFloorKernel.cpp b/src/core/cpu/kernels/CpuFloorKernel.cpp
index c2e9d48ce9..d41df6a1f5 100644
--- a/src/core/cpu/kernels/CpuFloorKernel.cpp
+++ b/src/core/cpu/kernels/CpuFloorKernel.cpp
@@ -54,18 +54,18 @@ struct FloorUKernel
{
const char *name;
const FloorSelectorPtr is_selected;
- FloorUKernelPtr func;
+ FloorUKernelPtr ukernel;
};
static const FloorUKernel available_kernels[] =
{
{
- "fp16_neon_floor",
+ "neon_fp16_floor",
[](const FloorSelectorData & data) { return data.dt == DataType::F16; },
REGISTER_FP16_NEON(arm_compute::cpu::fp16_neon_floor)
},
{
- "f32_neon_floor",
+ "neon_fp32_floor",
[](const FloorSelectorData & data) { return data.dt == DataType::F32; },
REGISTER_FP32_NEON(arm_compute::cpu::fp32_neon_floor)
},
@@ -94,7 +94,7 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst)
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
const auto *uk = get_implementation(FloorSelectorData{ src->data_type() });
- ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->func == nullptr);
+ ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
// Validate in case of configured output
if(dst->total_size() > 0)
@@ -110,12 +110,15 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst)
void CpuFloorKernel::configure(const ITensorInfo *src, ITensorInfo *dst)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst));
- // Auto initialize output
auto_init_if_empty(*dst, src->tensor_shape(), 1, src->data_type());
- // Validate
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst));
+ const auto *uk = get_implementation(FloorSelectorData{ src->data_type() });
+ ARM_COMPUTE_ERROR_ON_NULLPTR(uk);
+
+ _run_method = uk->ukernel;
+ _name = std::string("CpuFloorKernel").append("/").append(uk->name);
// Configure kernel window
const Window win = calculate_max_window(*src, Steps());
@@ -146,12 +149,11 @@ void CpuFloorKernel::run_op(ITensorPack &tensors, const Window &window, const Th
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
ARM_COMPUTE_ERROR_ON(tensors.empty());
+ ARM_COMPUTE_ERROR_ON(_run_method == nullptr);
const ITensor *src = tensors.get_const_tensor(TensorType::ACL_SRC);
ITensor *dst = tensors.get_tensor(TensorType::ACL_DST);
-
- const auto len = static_cast<int>(window.x().end()) - static_cast<int>(window.x().start());
- const auto *ukernel = get_implementation(FloorSelectorData{ src->info()->data_type() });
+ const auto len = static_cast<int>(window.x().end()) - static_cast<int>(window.x().start());
Window win{ window };
win.set(Window::DimX, Window::Dimension(0, 1, 1));
@@ -161,14 +163,14 @@ void CpuFloorKernel::run_op(ITensorPack &tensors, const Window &window, const Th
execute_window_loop(win, [&](const Coordinates &)
{
- ukernel->func(src_it.ptr(), dst_it.ptr(), len);
+ _run_method(src_it.ptr(), dst_it.ptr(), len);
},
src_it, dst_it);
}
const char *CpuFloorKernel::name() const
{
- return "CpuFloorKernel";
+ return _name.c_str();
}
} // namespace kernels
} // namespace cpu
diff --git a/src/core/cpu/kernels/CpuFloorKernel.h b/src/core/cpu/kernels/CpuFloorKernel.h
index 2680871b45..78534d2a1d 100644
--- a/src/core/cpu/kernels/CpuFloorKernel.h
+++ b/src/core/cpu/kernels/CpuFloorKernel.h
@@ -45,10 +45,9 @@ public:
* @param[out] dst Destination tensor. Same as @p src
*/
void configure(const ITensorInfo *src, ITensorInfo *dst);
- /** Static function to check if given info will lead to a valid configuration of @ref CpuFloorKernel
+ /** Static function to check if given info will lead to a valid configuration
*
- * @param[in] src Source tensor info. Data type supported: F16/F32.
- * @param[in] dst Destination tensor info. Same as @p src
+ * Similar to CpuFloorKernel::configure()
*
* @return a status
*/
@@ -65,6 +64,13 @@ public:
// Inherited methods overridden:
void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
const char *name() const override;
+
+private:
+ using FloorUKernelPtr = std::add_pointer<void(const void *, void *, int)>::type;
+
+private:
+ FloorUKernelPtr _run_method{ nullptr };
+ std::string _name{};
};
} // namespace kernels
} // namespace cpu
diff --git a/src/core/cpu/kernels/CpuPool2dKernel.cpp b/src/core/cpu/kernels/CpuPool2dKernel.cpp
index bfde2dfa80..27f4b950db 100644
--- a/src/core/cpu/kernels/CpuPool2dKernel.cpp
+++ b/src/core/cpu/kernels/CpuPool2dKernel.cpp
@@ -72,92 +72,92 @@ struct PoolingKernel
static const PoolingKernel available_kernels[] =
{
{
- "poolingMxN_qasymm8_neon_nhwc",
+ "neon_qu8_nhwc_poolMxN",
[](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NHWC) && (data.dt == DataType::QASYMM8)); },
REGISTER_QASYMM8_NEON(arm_compute::cpu::poolingMxN_qasymm8_neon_nhwc)
},
{
- "poolingMxN_qasymm8_signed_neon_nhwc",
+ "neon_qs8_nhwc_poolMxN",
[](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NHWC) && (data.dt == DataType::QASYMM8_SIGNED)); },
REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::poolingMxN_qasymm8_signed_neon_nhwc)
},
#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
{
- "poolingMxN_fp16_neon_nhwc",
+ "neon_f16_nhwc_poolMxN",
[](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NHWC) && (data.dt == DataType::F16)); },
REGISTER_FP16_NEON(arm_compute::cpu::poolingMxN_fp16_neon_nhwc)
},
#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */
{
- "poolingMxN_fp32_neon_nhwc",
+ "neon_fp32_nhwc_poolMxN",
[](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NHWC) && (data.dt == DataType::F32)); },
REGISTER_FP32_NEON(arm_compute::cpu::poolingMxN_fp32_neon_nhwc)
},
#if defined(ENABLE_NCHW_KERNELS)
{
- "pooling2_qasymm8_neon_nchw",
+ "neon_qu8_nchw_pool2",
[](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 2) && (data.pool_stride_x < 3)); },
REGISTER_QASYMM8_NEON(arm_compute::cpu::pooling2_quantized_neon_nchw<uint8_t>)
},
{
- "pooling3_qasymm8_neon_nchw",
+ "neon_qu8_nchw_pool3",
[](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 3) && (data.pool_stride_x < 3)); },
REGISTER_QASYMM8_NEON(arm_compute::cpu::pooling3_quantized_neon_nchw<uint8_t>)
},
{
- "poolingMxN_qasymm8_neon_nchw",
+ "neon_qu8_nchw_poolMxN",
[](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8)); },
REGISTER_QASYMM8_NEON(arm_compute::cpu::poolingMxN_quantized_neon_nchw<uint8_t>)
},
{
- "pooling2_qasymm8_signed_neon_nchw",
+ "neon_qs8_nchw_pool2",
[](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8_SIGNED) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 2) && (data.pool_stride_x < 3)); },
REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::pooling2_quantized_neon_nchw<int8_t>)
},
{
- "pooling3_qasymm8_signed_neon_nchw",
+ "neon_qs8_nchw_pool3",
[](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8_SIGNED) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 3) && (data.pool_stride_x < 3)); },
REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::pooling3_quantized_neon_nchw<int8_t>)
},
{
- "poolingMxN_qasymm8_signed_neon_nchw",
+ "neon_qs8_nchw_poolMxN",
[](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8_SIGNED)); },
REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::poolingMxN_quantized_neon_nchw<int8_t>)
},
#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
{
- "pooling2_fp16_neon_nchw",
+ "neon_fp16_nchw_pool2",
[](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F16) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 2)); },
REGISTER_FP16_NEON(arm_compute::cpu::pooling2_fp16_neon_nchw)
},
{
- "pooling3_fp16_neon_nchw",
+ "neon_fp16_nchw_pool3",
[](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F16) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 3)); },
REGISTER_FP16_NEON(arm_compute::cpu::pooling3_fp16_neon_nchw)
},
{
- "poolingMxN_fp16_neon_nchw",
+ "neon_fp16_nchw_poolMxN",
[](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F16)); },
REGISTER_FP16_NEON(arm_compute::cpu::poolingMxN_fp16_neon_nchw)
},
#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */
{
- "pooling2_fp32_neon_nchw",
+ "neon_fp32_nchw_pool2",
[](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F32) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 2)); },
REGISTER_FP32_NEON(arm_compute::cpu::pooling2_fp32_neon_nchw)
},
{
- "pooling3_fp32_neon_nchw",
+ "neon_fp32_nchw_pool3",
[](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F32) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 3)); },
REGISTER_FP32_NEON(arm_compute::cpu::pooling3_fp32_neon_nchw)
},
{
- "pooling7_fp32_neon_nchw",
+ "neon_fp32_nchw_pool7",
[](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F32) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 7)); },
REGISTER_FP32_NEON(arm_compute::cpu::pooling7_fp32_neon_nchw)
},
{
- "poolingMxN_fp32_neon_nchw",
+ "neon_fp32_nchw_poolMxN",
[](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F32)); },
REGISTER_FP32_NEON(arm_compute::cpu::poolingMxN_fp32_neon_nchw)
},
@@ -398,11 +398,16 @@ void CpuPool2dKernel::configure(ITensorInfo *src, ITensorInfo *dst, const Poolin
// Perform validation step
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, pool_info, indices, pool_size));
+ const auto *uk = get_implementation(src->data_type(), src->data_layout(), pad_stride_info.stride().first, pool_size);
+ ARM_COMPUTE_ERROR_ON(uk == nullptr);
+
// Set instance variables
_pool_info = pool_info;
_data_layout = src->data_layout();
_pool_size = pool_size;
_pool_stride_x = pad_stride_info.stride().first;
+ _run_method = uk->ukernel;
+ _name = std::string("CpuPool2dKernel").append("/").append(uk->name);
if(_data_layout == DataLayout::NHWC)
{
@@ -451,6 +456,7 @@ void CpuPool2dKernel::run_op(ITensorPack &tensors, const Window &window, const T
ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON(_run_method == nullptr);
const ITensor *src = tensors.get_const_tensor(TensorType::ACL_SRC_0);
ITensor *dst = tensors.get_tensor(TensorType::ACL_DST_0);
@@ -498,16 +504,12 @@ void CpuPool2dKernel::run_op(ITensorPack &tensors, const Window &window, const T
window_src.set(Window::DimY, Window::Dimension(0, src->info()->dimension(1), pool_stride_x));
window_src.set(Window::DimZ, Window::Dimension(0, src->info()->dimension(2), pool_stride_y));
}
-
- const auto *uk = get_implementation(src->info()->data_type(), _data_layout, _pool_stride_x, _pool_size);
- ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
-
- uk->ukernel(src, dst, indices, _pool_info, window_src, window);
+ _run_method(src, dst, indices, _pool_info, window_src, window);
}
const char *CpuPool2dKernel::name() const
{
- return "CpuPool2dKernel";
+ return _name.c_str();
}
} // namespace kernels
} // namespace cpu
diff --git a/src/core/cpu/kernels/CpuPool2dKernel.h b/src/core/cpu/kernels/CpuPool2dKernel.h
index 95298004e9..ff7d7bb21d 100644
--- a/src/core/cpu/kernels/CpuPool2dKernel.h
+++ b/src/core/cpu/kernels/CpuPool2dKernel.h
@@ -65,12 +65,17 @@ public:
const char *name() const override;
private:
+ using PoolingKernelPtr = std::add_pointer<void(const ITensor *, ITensor *, ITensor *, PoolingLayerInfo &, const Window &, const Window &)>::type;
+
+private:
PoolingLayerInfo _pool_info{};
DataLayout _data_layout{ DataLayout::UNKNOWN };
unsigned int _num_elems_processed_per_iteration{ 0 };
BorderSize _border_size{ 0 };
Size2D _pool_size{};
int _pool_stride_x{};
+ PoolingKernelPtr _run_method{ nullptr };
+ std::string _name{};
};
} // namespace kernels
} // namespace cpu
diff --git a/src/core/cpu/kernels/CpuScaleKernel.cpp b/src/core/cpu/kernels/CpuScaleKernel.cpp
index a072dbd896..0c1f08ab79 100644
--- a/src/core/cpu/kernels/CpuScaleKernel.cpp
+++ b/src/core/cpu/kernels/CpuScaleKernel.cpp
@@ -67,32 +67,32 @@ static const ScaleKernel available_kernels[] =
{
#if defined(ARM_COMPUTE_ENABLE_SVE)
{
- "fp16_sve_scale",
+ "sve_fp16_scale",
[](const ScaleSelectorData & data) { return data.dt == DataType::F16 && data.ci.has_sve(); },
REGISTER_FP16_SVE(arm_compute::cpu::fp16_sve_scale)
},
{
- "f32_sve_scale",
+ "sve_fp32_scale",
[](const ScaleSelectorData & data) { return data.dt == DataType::F32 && data.ci.has_sve(); },
REGISTER_FP32_SVE(arm_compute::cpu::fp32_sve_scale)
},
{
- "qasymm8_sve_scale",
+ "sve_qu8_scale",
[](const ScaleSelectorData & data) { return data.dt == DataType::QASYMM8 && data.ci.has_sve(); },
REGISTER_QASYMM8_SVE(arm_compute::cpu::qasymm8_sve_scale)
},
{
- "qasymm8_signed_sve_scale",
+ "sve_qs8_scale",
[](const ScaleSelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED && data.ci.has_sve(); },
REGISTER_QASYMM8_SIGNED_SVE(arm_compute::cpu::qasymm8_signed_sve_scale)
},
{
- "u8_sve_scale",
+ "sve_u8_scale",
[](const ScaleSelectorData & data) { return data.dt == DataType::U8 && data.ci.has_sve(); },
REGISTER_INTEGER_SVE(arm_compute::cpu::u8_sve_scale)
},
{
- "s16_sve_scale",
+ "sve_s16_scale",
[](const ScaleSelectorData & data) { return data.dt == DataType::S16 && data.ci.has_sve(); },
REGISTER_INTEGER_SVE(arm_compute::cpu::s16_sve_scale)
},
@@ -100,33 +100,33 @@ static const ScaleKernel available_kernels[] =
#if defined(ARM_COMPUTE_ENABLE_NEON)
#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
{
- "common_neon_scale",
+ "neon_fp16_scale",
[](const ScaleSelectorData & data) { return data.dt == DataType::F16 && data.ci.has_fp16(); },
REGISTER_FP16_NEON(arm_compute::cpu::common_neon_scale<float16_t>)
},
#endif /* !defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */
{
- "common_neon_scale",
+ "neon_fp32_scale",
[](const ScaleSelectorData & data) { return data.dt == DataType::F32; },
REGISTER_FP32_NEON(arm_compute::cpu::common_neon_scale<float>)
},
{
- "qasymm8_neon_scale",
+ "neon_qu8_scale",
[](const ScaleSelectorData & data) { return data.dt == DataType::QASYMM8; },
REGISTER_QASYMM8_NEON(arm_compute::cpu::qasymm8_neon_scale)
},
{
- "qasymm8_signed_neon_scale",
+ "neon_qs8_scale",
[](const ScaleSelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED; },
REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::qasymm8_signed_neon_scale)
},
{
- "common_neon_scale",
+ "neon_u8_scale",
[](const ScaleSelectorData & data) { return data.dt == DataType::U8; },
REGISTER_INTEGER_NEON(arm_compute::cpu::common_neon_scale<uint8_t>)
},
{
- "common_neon_scale",
+ "neon_s16_scale",
[](const ScaleSelectorData & data) { return data.dt == DataType::S16; },
REGISTER_INTEGER_NEON(arm_compute::cpu::common_neon_scale<int16_t>)
},
@@ -199,11 +199,6 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dx, const I
}
} // namespace
-CpuScaleKernel::CpuScaleKernel()
- : _func(nullptr), _policy(), _border_mode(), _constant_border_value(PixelValue()), _sampling_offset(0), _align_corners(false), _data_layout(DataLayout::UNKNOWN)
-{
-}
-
void CpuScaleKernel::configure(const ITensorInfo *src, const ITensorInfo *dx, const ITensorInfo *dy, const ITensorInfo *offsets,
ITensorInfo *dst, const ScaleKernelInfo &info)
{
@@ -217,6 +212,12 @@ void CpuScaleKernel::configure(const ITensorInfo *src, const ITensorInfo *dx, co
dst,
info));
+ const auto *uk = get_implementation(ScaleSelectorData{ src->data_type(), CPUInfo::get() });
+ ARM_COMPUTE_ERROR_ON_NULLPTR(uk);
+
+ _run_method = uk->ukernel;
+ _name = std::string("CpuScaleKernel").append("/").append(uk->name).append("_").append(string_from_interpolation_policy(info.interpolation_policy));
+
// Get data layout and width/height indices
_data_layout = info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : info.data_layout;
const int idx_width = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
@@ -595,6 +596,7 @@ void CpuScaleKernel::run_op(ITensorPack &tensors, const Window &window, const Th
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
ARM_COMPUTE_ERROR_ON(_func == nullptr && _data_layout == DataLayout::NCHW);
+ ARM_COMPUTE_ERROR_ON(_run_method == nullptr && _data_layout == DataLayout::NHWC);
const auto src = tensors.get_const_tensor(TensorType::ACL_SRC);
auto dst = tensors.get_tensor(TensorType::ACL_DST);
@@ -608,14 +610,13 @@ void CpuScaleKernel::run_op(ITensorPack &tensors, const Window &window, const Th
}
else
{
- const auto *uk = get_implementation(ScaleSelectorData{ src->info()->data_type(), CPUInfo::get() });
- uk->ukernel(src, dst, offsets, dx, dy, _policy, _border_mode, _constant_border_value, _sampling_offset, _align_corners, window);
+ _run_method(src, dst, offsets, dx, dy, _policy, _border_mode, _constant_border_value, _sampling_offset, _align_corners, window);
}
}
const char *CpuScaleKernel::name() const
{
- return "CpuScaleKernel";
+ return _name.c_str();
}
} // namespace kernels
} // namespace cpu
diff --git a/src/core/cpu/kernels/CpuScaleKernel.h b/src/core/cpu/kernels/CpuScaleKernel.h
index 24790d16d7..afaf074340 100644
--- a/src/core/cpu/kernels/CpuScaleKernel.h
+++ b/src/core/cpu/kernels/CpuScaleKernel.h
@@ -39,7 +39,7 @@ class CpuScaleKernel : public ICpuKernel
{
public:
/** Default constructor */
- CpuScaleKernel();
+ CpuScaleKernel() = default;
ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuScaleKernel);
/** Initialise the kernel's inputs, output and interpolation policy
*
@@ -55,17 +55,11 @@ public:
*/
void configure(const ITensorInfo *src, const ITensorInfo *dx, const ITensorInfo *dy, const ITensorInfo *offsets, ITensorInfo *dst,
const ScaleKernelInfo &info);
- /** Static function to check if given info will lead to a valid configuration of @ref CpuScaleKernel
+ /** Static function to check if given info will lead to a valid configuration
*
- * @note dx, dy and offsets have the same dimensions (width and height) of the output tensor
- * @note Using @p policy Area only supports data layout NCHW and input data type U8.
+ * Similar to CpuScaleKernel::configure()
*
- * @param[in] src Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/U8/S16/F16/F32.
- * @param[in] dx Distance x tensor info. Pixel's distance between the X real coordinate and the smallest X following integer. Data type supported: F32
- * @param[in] dy Distance y tensor info. Pixel's distance between the Y real coordinate and the smallest Y following integer. Data type supported: F32
- * @param[in] offsets Offset tensor info. Offset to access the pixel with NEAREST interpolation or the top-left pixel with BILINEAR interpolation in the input tensor. Data type supported: S32.
- * @param[in] dst Destination tensor info. Data types supported: Same as @p input. All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
- * @param[in] info @ref ScaleKernelInfo to use for validation
+ * @return a status
*/
static Status validate(const ITensorInfo *src, const ITensorInfo *dx, const ITensorInfo *dy, const ITensorInfo *offsets, ITensorInfo *dst,
const ScaleKernelInfo &info);
@@ -96,14 +90,18 @@ private:
/** Scale function to use for the particular function to use */
using ScaleFunctionPtr = void (CpuScaleKernel::*)(const ITensor *, ITensor *, const ITensor *, const ITensor *, const ITensor *, const Window &window);
+ using ScaleKernelPtr = std::add_pointer<void(const ITensor *, ITensor *, const ITensor *, const ITensor *, const ITensor *,
+ InterpolationPolicy, BorderMode, PixelValue, float, bool, const Window &)>::type;
- ScaleFunctionPtr _func;
- InterpolationPolicy _policy;
- BorderMode _border_mode;
- PixelValue _constant_border_value;
- float _sampling_offset;
- bool _align_corners;
- DataLayout _data_layout;
+ ScaleFunctionPtr _func{ nullptr };
+ InterpolationPolicy _policy{};
+ BorderMode _border_mode{};
+ PixelValue _constant_border_value{};
+ float _sampling_offset{ 0 };
+ bool _align_corners{ false };
+ DataLayout _data_layout{ DataLayout::UNKNOWN };
+ ScaleKernelPtr _run_method{ nullptr };
+ std::string _name{};
};
} // namespace kernels
} // namespace cpu
diff --git a/src/core/cpu/kernels/CpuSoftmaxKernel.cpp b/src/core/cpu/kernels/CpuSoftmaxKernel.cpp
index 1e00e12050..c562699092 100644
--- a/src/core/cpu/kernels/CpuSoftmaxKernel.cpp
+++ b/src/core/cpu/kernels/CpuSoftmaxKernel.cpp
@@ -72,12 +72,12 @@ static const SoftmaxLogits1DKernel available_logits_1d_kernels[] =
{
#if defined(ARM_COMPUTE_ENABLE_SVE)
{
- "sve_softmax_logits_1d_float",
+ "sve_fp32_softmax_logits_1d",
[](const SoftmaxSelectorData & data) { return (data.dt == DataType::F32) && data.ci.has_sve(); },
REGISTER_FP32_SVE(arm_compute::cpu::sve_softmax_logits_1d_float<float>)
},
{
- "sve_softmax_logits_1d_float",
+ "sve_fp16_softmax_logits_1d",
[](const SoftmaxSelectorData & data) { return (data.dt == DataType::F16) && data.ci.has_sve(); },
REGISTER_FP16_SVE(arm_compute::cpu::sve_softmax_logits_1d_float<float16_t>)
},
@@ -85,13 +85,13 @@ static const SoftmaxLogits1DKernel available_logits_1d_kernels[] =
#if defined(ARM_COMPUTE_ENABLE_NEON)
{
- "neon_softmax_logits_1d_float",
+ "neon_fp32_softmax_logits_1d",
[](const SoftmaxSelectorData & data) { return (data.dt == DataType::F32); },
REGISTER_FP32_NEON(arm_compute::cpu::neon_softmax_logits_1d_float<float>)
},
#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
{
- "neon_softmax_logits_1d_float",
+ "neon_fp16_softmax_logits_1d",
[](const SoftmaxSelectorData & data) { return (data.dt == DataType::F16); },
REGISTER_FP16_NEON(arm_compute::cpu::neon_softmax_logits_1d_float<float16_t>)
},
@@ -100,23 +100,23 @@ static const SoftmaxLogits1DKernel available_logits_1d_kernels[] =
#if defined(ARM_COMPUTE_ENABLE_SVE2)
{
- "sve_softmax_logits_1d_quantized",
+ "sve2_qu8_softmax_logits_1d",
[](const SoftmaxSelectorData & data) { return (data.dt == DataType::QASYMM8) && data.ci.has_sve2(); },
REGISTER_QASYMM8_SVE(arm_compute::cpu::sve_softmax_logits_1d_quantized<qasymm8_t>)
},
{
- "sve_softmax_logits_1d_quantized",
+ "sve2_qs8_softmax_logits_1d",
[](const SoftmaxSelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED) && data.ci.has_sve2(); },
REGISTER_QASYMM8_SIGNED_SVE(arm_compute::cpu::sve_softmax_logits_1d_quantized<qasymm8_signed_t>)
},
#endif /* defined(ARM_COMPUTE_ENABLE_SVE2) */
{
- "neon_softmax_logits_1d_quantized",
+ "neon_qu8_softmax_logits_1d",
[](const SoftmaxSelectorData & data) { return (data.dt == DataType::QASYMM8); },
REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_softmax_logits_1d_quantized<qasymm8_t>)
},
{
- "neon_softmax_logits_1d_quantized",
+ "neon_qs8_softmax_logits_1d",
[](const SoftmaxSelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED); },
REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_softmax_logits_1d_quantized<qasymm8_signed_t>)
},
@@ -126,46 +126,46 @@ static const SoftmaxLogits1DMaxKernel available_logits_1d_max_kernels[] =
{
#if defined(ARM_COMPUTE_ENABLE_SVE)
{
- "sve_logits_1d_max",
+ "sve_fp32_logits_1d_max",
[](const SoftmaxSelectorData & data) { return (data.dt == DataType::F32) && data.ci.has_sve(); },
REGISTER_FP32_SVE(arm_compute::cpu::sve_logits_1d_max<float>)
},
{
- "sve_logits_1d_max",
+ "sve_fp16_logits_1d_max",
[](const SoftmaxSelectorData & data) { return (data.dt == DataType::F16) && data.ci.has_sve(); },
REGISTER_FP16_SVE(arm_compute::cpu::sve_logits_1d_max<float16_t>)
},
{
- "sve_logits_1d_max",
+ "sve_qu8_logits_1d_max",
[](const SoftmaxSelectorData & data) { return (data.dt == DataType::QASYMM8) && data.ci.has_sve(); },
REGISTER_QASYMM8_SVE(arm_compute::cpu::sve_logits_1d_max<qasymm8_t>)
},
{
- "sve_logits_1d_max",
+ "sve_qs8_logits_1d_max",
[](const SoftmaxSelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED) && data.ci.has_sve(); },
REGISTER_QASYMM8_SIGNED_SVE(arm_compute::cpu::sve_logits_1d_max<qasymm8_signed_t>)
},
#endif /* defined(ARM_COMPUTE_ENABLE_SVE) */
#if defined(ARM_COMPUTE_ENABLE_NEON)
{
- "neon_logits_1d_max",
+ "neon_fp32_logits_1d_max",
[](const SoftmaxSelectorData & data) { return (data.dt == DataType::F32); },
REGISTER_FP32_NEON(arm_compute::cpu::neon_logits_1d_max<float>)
},
#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
{
- "neon_logits_1d_max",
+ "neon_fp16_logits_1d_max",
[](const SoftmaxSelectorData & data) { return (data.dt == DataType::F16); },
REGISTER_FP16_NEON(arm_compute::cpu::neon_logits_1d_max<float16_t>)
},
#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */
{
- "neon_logits_1d_max",
+ "neon_qu8_logits_1d_max",
[](const SoftmaxSelectorData & data) { return (data.dt == DataType::QASYMM8); },
REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_logits_1d_max<qasymm8_t>)
},
{
- "neon_logits_1d_max",
+ "neon_qs8_logits_1d_max",
[](const SoftmaxSelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED); },
REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_logits_1d_max<qasymm8_signed_t>)
},
@@ -214,15 +214,9 @@ Status validate_arguments_logits_1d_max(const ITensorInfo &input, const ITensorI
} // namespace
-CpuLogits1DMaxKernel::CpuLogits1DMaxKernel()
-{
-}
-
void CpuLogits1DMaxKernel::configure(const ITensorInfo *src, ITensorInfo *dst)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
-
- // Perform validation step
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_logits_1d_max(*src, *dst));
// Softmax across the x dimension
@@ -230,8 +224,13 @@ void CpuLogits1DMaxKernel::configure(const ITensorInfo *src, ITensorInfo *dst)
// Output auto initialization if not yet initialized
auto_init_if_empty(*dst, output_shape, 1, src->data_type(), src->quantization_info());
- Window win = calculate_max_window(*src, Steps());
+ const auto *uk = get_implementation_logits_max(SoftmaxSelectorData{ src->data_type(), CPUInfo::get() });
+ ARM_COMPUTE_ERROR_ON_NULLPTR(uk);
+
+ _run_method = uk->ukernel;
+ _name = std::string("CpuLogits1DMaxKernel").append("/").append(uk->name);
+ Window win = calculate_max_window(*src, Steps());
ICpuKernel::configure(win);
}
@@ -248,17 +247,17 @@ void CpuLogits1DMaxKernel::run_op(ITensorPack &tensors, const Window &window, co
ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON(_run_method == nullptr);
const auto src = tensors.get_const_tensor(TensorType::ACL_SRC);
auto dst = tensors.get_tensor(TensorType::ACL_DST);
- const auto *uk = get_implementation_logits_max(SoftmaxSelectorData{ src->info()->data_type(), CPUInfo::get() });
- uk->ukernel(src, dst, window);
+ _run_method(src, dst, window);
}
const char *CpuLogits1DMaxKernel::name() const
{
- return "CpuLogits1DMaxKernel";
+ return _name.c_str();
}
namespace
@@ -302,21 +301,11 @@ Status validate_arguments_logits_softmax(const ITensorInfo &src, const ITensorIn
} // namespace
template <bool IS_LOG>
-CpuLogits1DSoftmaxKernel<IS_LOG>::CpuLogits1DSoftmaxKernel()
- : _beta(1.0f)
-{
-}
-
-template <bool IS_LOG>
void CpuLogits1DSoftmaxKernel<IS_LOG>::configure(const ITensorInfo *src, const ITensorInfo *max, ITensorInfo *dst, const float beta, ITensorInfo *tmp)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src, max, dst, tmp);
- ARM_COMPUTE_ERROR_ON_NULLPTR(src, max, dst, tmp);
- // Perform validation step
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_logits_softmax(*src, *max, *dst, beta, *tmp, IS_LOG));
- _beta = beta;
-
// Configure kernel window
const bool is_quantized_asymmetric = is_data_type_quantized_asymmetric(src->data_type());
@@ -328,6 +317,15 @@ void CpuLogits1DSoftmaxKernel<IS_LOG>::configure(const ITensorInfo *src, const I
const DataType tmp_data_type = is_quantized_asymmetric ? DataType::F32 : src->data_type();
auto_init_if_empty(*tmp, TensorInfo(*src).set_data_type(tmp_data_type).reset_padding());
+ const auto *uk = get_implementation_logits(SoftmaxSelectorData{ src->data_type(), CPUInfo::get() });
+ ARM_COMPUTE_ERROR_ON_NULLPTR(uk);
+
+ std::string kernel_name = IS_LOG ? std::string("CpuLogits1DLogSoftmaxKernel") : std::string("CpuLogits1DSoftmaxKernel");
+
+ _beta = beta;
+ _run_method = uk->ukernel;
+ _name = kernel_name.append("/").append(uk->name);
+
// Configure kernel window
Window win = calculate_max_window(*max, Steps());
@@ -350,6 +348,7 @@ void CpuLogits1DSoftmaxKernel<IS_LOG>::run_op(ITensorPack &tensors, const Window
ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON(_run_method == nullptr);
const auto src = tensors.get_const_tensor(TensorType::ACL_SRC_0);
auto max = tensors.get_tensor(TensorType::ACL_SRC_1);
@@ -362,22 +361,13 @@ void CpuLogits1DSoftmaxKernel<IS_LOG>::run_op(ITensorPack &tensors, const Window
ARM_COMPUTE_ERROR_ON(tmp->info()->total_size() < (info.num_threads * tmp_size_for_thread));
void *tmp_for_thread = tmp->buffer() + (info.thread_id * tmp_size_for_thread);
-
- const auto *uk = get_implementation_logits(SoftmaxSelectorData{ src->info()->data_type(), CPUInfo::get() });
- uk->ukernel(src, max, tmp_for_thread, dst, _beta, IS_LOG, window);
+ _run_method(src, max, tmp_for_thread, dst, _beta, IS_LOG, window);
}
template <bool IS_LOG>
const char *CpuLogits1DSoftmaxKernel<IS_LOG>::name() const
{
- if(IS_LOG)
- {
- return "CpuLogits1DSoftmaxKernel";
- }
- else
- {
- return "CpuLogits1DLogSoftmaxKernel";
- }
+ return _name.c_str();
}
template class CpuLogits1DSoftmaxKernel<true>;
diff --git a/src/core/cpu/kernels/CpuSoftmaxKernel.h b/src/core/cpu/kernels/CpuSoftmaxKernel.h
index aa10467965..2912098c30 100644
--- a/src/core/cpu/kernels/CpuSoftmaxKernel.h
+++ b/src/core/cpu/kernels/CpuSoftmaxKernel.h
@@ -38,7 +38,7 @@ class CpuLogits1DMaxKernel : public ICpuKernel
{
public:
/** Constructor */
- CpuLogits1DMaxKernel();
+ CpuLogits1DMaxKernel() = default;
ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuLogits1DMaxKernel);
/** Set the input and output tensors.
*
@@ -46,10 +46,9 @@ public:
* @param[out] dst Destination tensor info. Data types supported: same as @p input
*/
void configure(const ITensorInfo *src, ITensorInfo *dst);
- /** Static function to check if given info will lead to a valid configuration of @ref CpuLogits1DMaxKernel
+ /** Static function to check if given info will lead to a valid configuration
*
- * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * @param[in] dst Destination tensor info. Data types supported: same as @p input
+ * Similar to CpuLogits1DMaxKernel::configure()
*
* @return a status
*/
@@ -58,6 +57,13 @@ public:
// Inherited methods overridden:
void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
const char *name() const override;
+
+private:
+ using SoftmaxLogits1DMaxKernelPtr = std::add_pointer<void(const ITensor *, ITensor *, const Window &)>::type;
+
+private:
+ SoftmaxLogits1DMaxKernelPtr _run_method{ nullptr };
+ std::string _name{};
};
/** Interface for softmax computation for QASYMM8 with pre-computed max. */
@@ -66,7 +72,7 @@ class CpuLogits1DSoftmaxKernel : public ICpuKernel
{
public:
/** Default constructor */
- CpuLogits1DSoftmaxKernel();
+ CpuLogits1DSoftmaxKernel() = default;
ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuLogits1DSoftmaxKernel);
/** Set the input and output tensors.
@@ -80,14 +86,9 @@ public:
* @param tmp Auxiliary tensor info. Must be type F32 and same shape as the input.
*/
void configure(const ITensorInfo *src, const ITensorInfo *max, ITensorInfo *dst, const float beta, ITensorInfo *tmp);
- /** Static function to check if given info will lead to a valid configuration of @ref CpuLogits1DSoftmaxKernel
+ /** Static function to check if given info will lead to a valid configuration
*
- * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * @param[in] max Max values tensor info. Same shape as input with dimension 0 set to 1.
- * Data types supported: same as @p input.
- * @param[in] dst Destination tensor info. Data types supported: same as @p input.
- * @param[in] beta A scaling factor for the exponent.
- * @param[in] tmp Tensor info of auxiliary. Must be type F32 and same shape as the input.
+ * Similar to CpuLogits1DSoftmaxKernel::configure()
*
* @return a status
*/
@@ -99,7 +100,12 @@ public:
const char *name() const override;
private:
- float _beta;
+ using SoftmaxLogits1DKernelPtr = std::add_pointer<void(const ITensor *, const ITensor *, void *const, ITensor *, float, bool, const Window &)>::type;
+
+private:
+ float _beta{ 1.0f };
+ SoftmaxLogits1DKernelPtr _run_method{ nullptr };
+ std::string _name{};
};
} // namespace kernels
} // namespace cpu
diff --git a/src/core/cpu/kernels/CpuSubKernel.cpp b/src/core/cpu/kernels/CpuSubKernel.cpp
index d7057bbe2b..098a324377 100644
--- a/src/core/cpu/kernels/CpuSubKernel.cpp
+++ b/src/core/cpu/kernels/CpuSubKernel.cpp
@@ -59,59 +59,59 @@ struct SubKernel
static const SubKernel available_kernels[] =
{
{
- "sub_same_neon",
+ "neon_fp32_sub",
[](const SubSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == DataType::F32)); },
REGISTER_FP32_NEON(arm_compute::cpu::sub_same_neon<float>)
},
#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
{
- "sub_same_neon",
+ "neon_fp16_sub",
[](const SubSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == DataType::F16)); },
REGISTER_FP16_NEON(arm_compute::cpu::sub_same_neon<float16_t>)
},
#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */
{
- "sub_same_neon",
+ "neon_u8_sub",
[](const SubSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == data.dt3) && (data.dt1 == DataType::U8)); },
REGISTER_INTEGER_NEON(arm_compute::cpu::sub_same_neon<uint8_t>)
},
{
- "sub_same_neon",
+ "neon_s16_sub",
[](const SubSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == data.dt3) && (data.dt1 == DataType::S16)); },
REGISTER_INTEGER_NEON(arm_compute::cpu::sub_same_neon<int16_t>)
},
{
- "sub_same_neon",
+ "neon_s32_sub",
[](const SubSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == data.dt3) && (data.dt1 == DataType::S32)); },
REGISTER_INTEGER_NEON(arm_compute::cpu::sub_same_neon<int32_t>)
},
{
- "sub_u8_s16_s16_neon",
+ "neon_u8_s16_s16_sub",
[](const SubSelectorData & data) { return ((data.dt1 == DataType::U8) && (data.dt2 == DataType::S16)); },
REGISTER_INTEGER_NEON(arm_compute::cpu::sub_u8_s16_s16_neon)
},
{
- "sub_s16_u8_s16_neon",
+ "neon_s16_u8_s16_sub",
[](const SubSelectorData & data) { return ((data.dt1 == DataType::S16) && (data.dt2 == DataType::U8)); },
REGISTER_INTEGER_NEON(arm_compute::cpu::sub_s16_u8_s16_neon)
},
{
- "sub_u8_u8_s16_neon",
+ "neon_u8_u8_s16_sub",
[](const SubSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt3 == DataType::S16)); },
REGISTER_INTEGER_NEON(arm_compute::cpu::sub_u8_u8_s16_neon)
},
{
- "sub_qasymm8_neon",
+ "neon_qu8_sub",
[](const SubSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == DataType::QASYMM8)); },
REGISTER_QASYMM8_NEON(arm_compute::cpu::sub_qasymm8_neon)
},
{
- "sub_qasymm8_signed_neon",
+ "neon_qs8_sub",
[](const SubSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == DataType::QASYMM8_SIGNED)); },
REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::sub_qasymm8_signed_neon)
},
{
- "sub_qsymm16_neon",
+        "neon_qs16_sub",
[](const SubSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == DataType::QSYMM16)); },
REGISTER_QSYMM16_NEON(arm_compute::cpu::sub_qsymm16_neon)
},
@@ -206,7 +206,12 @@ void CpuSubKernel::configure(const ITensorInfo *src0, const ITensorInfo *src1, I
// Auto initialize dst if not initialized
set_shape_if_empty(*dst, out_shape);
- _policy = policy;
+ const auto *uk = get_implementation(src0->data_type(), src1->data_type(), dst->data_type());
+ ARM_COMPUTE_ERROR_ON_NULLPTR(uk);
+
+ _policy = policy;
+ _run_method = uk->ukernel;
+ _name = std::string("CpuSubKernel").append("/").append(uk->name);
// CpuSubKernel doesn't need padding so update_window_and_padding() can be skipped
Window win = calculate_max_window(out_shape, Steps());
@@ -227,19 +232,18 @@ void CpuSubKernel::run_op(ITensorPack &tensors, const Window &window, const Thre
ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON(_run_method == nullptr);
const ITensor *src0 = tensors.get_const_tensor(TensorType::ACL_SRC_0);
const ITensor *src1 = tensors.get_const_tensor(TensorType::ACL_SRC_1);
ITensor *dst = tensors.get_tensor(TensorType::ACL_DST);
- // Dispatch kernel
- const auto *uk = get_implementation(src0->info()->data_type(), src1->info()->data_type(), dst->info()->data_type());
- uk->ukernel(src0, src1, dst, _policy, window);
+ _run_method(src0, src1, dst, _policy, window);
}
const char *CpuSubKernel::name() const
{
- return "CpuSubKernel";
+ return _name.c_str();
}
} // namespace kernels
} // namespace cpu
diff --git a/src/core/cpu/kernels/CpuSubKernel.h b/src/core/cpu/kernels/CpuSubKernel.h
index da114b6e08..b9160bd150 100644
--- a/src/core/cpu/kernels/CpuSubKernel.h
+++ b/src/core/cpu/kernels/CpuSubKernel.h
@@ -61,25 +61,9 @@ public:
* @param[in] policy Overflow policy. Convert policy cannot be WRAP if datatype is quantized.
*/
void configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, ConvertPolicy policy);
- /** Static function to check if given info will lead to a valid configuration of @ref CpuSubKernel
+ /** Static function to check if given info will lead to a valid configuration
*
- * Valid configurations (src0,src1) -> dst :
- *
- * - (U8,U8) -> U8
- * - (U8,U8) -> S16
- * - (QASYMM8, QASYMM8) -> QASYMM8
- * - (QASYMM8_SIGNED, QASYMM8_SIGNED) -> QASYMM8_SIGNED
- * - (S16,U8) -> S16
- * - (U8,S16) -> S16
- * - (S16,S16) -> S16
- * - (S32,S32) -> S32
- * - (F16,F16) -> F16
- * - (F32,F32) -> F32
- *
- * @param[in] src0 An input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/S32/F16/F32
- * @param[in] src1 An input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/S32/F16/F32
- * @param[in] dst The dst tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/S32/F16/F32.
- * @param[in] policy Policy to use to handle overflow. Convert policy cannot be WRAP if datatype is quantized.
+ * Similar to CpuSubKernel::configure()
*
* @return a status
*/
@@ -90,7 +74,12 @@ public:
const char *name() const override;
private:
+ using SubKernelPtr = std::add_pointer<void(const ITensor *, const ITensor *, ITensor *, const ConvertPolicy &, const Window &)>::type;
+
+private:
ConvertPolicy _policy{};
+ SubKernelPtr _run_method{ nullptr };
+ std::string _name{};
};
} // namespace kernels
} // namespace cpu