From 5fdde99f4271891a40c02cd1e89f1344aa84583a Mon Sep 17 00:00:00 2001 From: Georgios Pinitas Date: Fri, 25 Jun 2021 05:42:57 +0100 Subject: Improve selection speed of CPU implementations CPU micro-kernel to be used was picked during kernel execution. Move selection during configuration to reduce runtime overhead. Standardize kernel names as follows: ___ e.g. sve_fp32_nhwc_scale Signed-off-by: Georgios Pinitas Change-Id: I544f1c08c8fef0f130a3bde61882ccb9a1f47f21 Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5855 Reviewed-by: Michele Di Giorgio Tested-by: Arm Jenkins --- src/core/cpu/kernels/CpuActivationKernel.cpp | 35 +++++---- src/core/cpu/kernels/CpuActivationKernel.h | 12 +-- src/core/cpu/kernels/CpuAddKernel.cpp | 59 ++++++++------- src/core/cpu/kernels/CpuAddKernel.h | 12 +-- src/core/cpu/kernels/CpuElementwiseKernel.cpp | 103 ++++++++++++++------------ src/core/cpu/kernels/CpuElementwiseKernel.h | 62 +++++++--------- src/core/cpu/kernels/CpuFloorKernel.cpp | 26 ++++--- src/core/cpu/kernels/CpuFloorKernel.h | 12 ++- src/core/cpu/kernels/CpuPool2dKernel.cpp | 48 ++++++------ src/core/cpu/kernels/CpuPool2dKernel.h | 5 ++ src/core/cpu/kernels/CpuScaleKernel.cpp | 41 +++++----- src/core/cpu/kernels/CpuScaleKernel.h | 32 ++++---- src/core/cpu/kernels/CpuSoftmaxKernel.cpp | 84 +++++++++------------ src/core/cpu/kernels/CpuSoftmaxKernel.h | 32 ++++---- src/core/cpu/kernels/CpuSubKernel.cpp | 36 +++++---- src/core/cpu/kernels/CpuSubKernel.h | 25 ++----- 16 files changed, 315 insertions(+), 309 deletions(-) (limited to 'src/core/cpu/kernels') diff --git a/src/core/cpu/kernels/CpuActivationKernel.cpp b/src/core/cpu/kernels/CpuActivationKernel.cpp index 24642f1efb..dad2ecfc5b 100644 --- a/src/core/cpu/kernels/CpuActivationKernel.cpp +++ b/src/core/cpu/kernels/CpuActivationKernel.cpp @@ -63,57 +63,57 @@ static const ActivationKernel available_kernels[] = { #if defined(ARM_COMPUTE_ENABLE_SVE) { - "fp16_sve_activation", + "sve_fp16_activation", [](const 
ActivationSelectorData & data) { return data.dt == DataType::F16 && data.ci.has_sve(); }, REGISTER_FP16_SVE(arm_compute::cpu::fp16_sve_activation) }, { - "fp32_sve_activation", + "sve_fp32_activation", [](const ActivationSelectorData & data) { return data.dt == DataType::F32 && data.ci.has_sve(); }, REGISTER_FP32_SVE(arm_compute::cpu::fp32_sve_activation) }, #endif /* defined(ARM_COMPUTE_ENABLE_SVE) */ #if defined(ARM_COMPUTE_ENABLE_NEON) { - "fp16_neon_activation", + "neon_fp16_activation", [](const ActivationSelectorData & data) { return data.dt == DataType::F16; }, REGISTER_FP16_NEON(arm_compute::cpu::fp16_neon_activation) }, { - "fp32_neon_activation", + "neon_fp32_activation", [](const ActivationSelectorData & data) { return data.dt == DataType::F32; }, REGISTER_FP32_NEON(arm_compute::cpu::fp32_neon_activation) }, #endif /* defined(ARM_COMPUTE_ENABLE_NEON) */ #if defined(ARM_COMPUTE_ENABLE_SVE2) { - "qasymm8_sve_activation", + "sve_qu8_activation", [](const ActivationSelectorData & data) { return data.dt == DataType::QASYMM8 && data.ci.has_sve2(); }, REGISTER_QASYMM8_SVE(arm_compute::cpu::qasymm8_sve_activation) }, { - "qasymm8_signed_sve_activation", + "sve_qs8_activation", [](const ActivationSelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED && data.ci.has_sve2(); }, REGISTER_QASYMM8_SIGNED_SVE(arm_compute::cpu::qasymm8_signed_sve_activation) }, { - "qsymm16_sve_activation", + "sve_qs16_activation", [](const ActivationSelectorData & data) { return data.dt == DataType::QSYMM16 && data.ci.has_sve2(); }, REGISTER_QSYMM16_SVE(arm_compute::cpu::qsymm16_sve_activation) }, #endif /* defined(ARM_COMPUTE_ENABLE_SVE2) */ { - "qasymm8_neon_activation", + "neon_qu8_activation", [](const ActivationSelectorData & data) { return data.dt == DataType::QASYMM8; }, REGISTER_QASYMM8_NEON(arm_compute::cpu::qasymm8_neon_activation) }, { - "qasymm8_signed_neon_activation", + "neon_qs8_activation", [](const ActivationSelectorData & data) { return data.dt == 
DataType::QASYMM8_SIGNED; }, REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::qasymm8_signed_neon_activation) }, { - "qsymm16_neon_activation", + "neon_qs16_activation", [](const ActivationSelectorData & data) { return data.dt == DataType::QSYMM16; }, REGISTER_QSYMM16_NEON(arm_compute::cpu::qsymm16_neon_activation) }, @@ -206,10 +206,14 @@ std::pair validate_and_configure_window(const ITensorInfo *src, void CpuActivationKernel::configure(const ITensorInfo *src, ITensorInfo *dst, ActivationLayerInfo activation_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, activation_info)); - _act_info = activation_info; + const auto uk = get_implementation(ActivationSelectorData{ src->data_type(), CPUInfo::get() }); + ARM_COMPUTE_ERROR_ON_NULLPTR(uk); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, activation_info)); + _act_info = activation_info; + _run_method = uk->ukernel; + _name = std::string("CpuActivationKernel").append("/").append(uk->name); // Configure kernel window auto win_config = validate_and_configure_window(src, dst); @@ -239,18 +243,17 @@ void CpuActivationKernel::run_op(ITensorPack &tensors, const Window &window, con ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); ARM_COMPUTE_ERROR_ON(tensors.empty()); + ARM_COMPUTE_ERROR_ON(_run_method == nullptr); const ITensor *src = tensors.get_const_tensor(TensorType::ACL_SRC); ITensor *dst = tensors.get_tensor(TensorType::ACL_DST); - const auto *uk = get_implementation(ActivationSelectorData{ src->info()->data_type(), CPUInfo::get() }); - - uk->ukernel(src, dst, _act_info, window); + _run_method(src, dst, _act_info, window); } const char *CpuActivationKernel::name() const { - return "CpuActivationKernel"; + return _name.c_str(); } } // namespace kernels } // namespace cpu diff --git a/src/core/cpu/kernels/CpuActivationKernel.h b/src/core/cpu/kernels/CpuActivationKernel.h index de71014303..37650345fe 100644 --- 
a/src/core/cpu/kernels/CpuActivationKernel.h +++ b/src/core/cpu/kernels/CpuActivationKernel.h @@ -49,12 +49,9 @@ public: * @param[in] activation_info Activation layer information. */ void configure(const ITensorInfo *src, ITensorInfo *dst, ActivationLayerInfo activation_info); - /** Static function to check if given info will lead to a valid configuration of @ref CpuActivationKernel + /** Static function to check if given info will lead to a valid configuration * - * @param[in] src Source tensor info. In case of @p dst tensor info = nullptr, this tensor will store the result - * of the activation function. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM16/F16/F32. - * @param[in] dst Destination tensor info. Data type supported: same as @p src - * @param[in] act_info Activation layer information. + * Similar to CpuActivationKernel::configure() * * @return a status */ @@ -64,8 +61,13 @@ public: void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; +private: + using ActivationKernelPtr = std::add_pointer::type; + private: ActivationLayerInfo _act_info{}; + ActivationKernelPtr _run_method{ nullptr }; + std::string _name{}; }; } // namespace kernels } // namespace cpu diff --git a/src/core/cpu/kernels/CpuAddKernel.cpp b/src/core/cpu/kernels/CpuAddKernel.cpp index 8d74b4027b..12766037a7 100644 --- a/src/core/cpu/kernels/CpuAddKernel.cpp +++ b/src/core/cpu/kernels/CpuAddKernel.cpp @@ -69,7 +69,7 @@ static const AddKernel available_kernels[] = { #if defined(ARM_COMPUTE_ENABLE_SVE2) { - "add_qasymm8_sve", + "sve2_qu8_add", [](const AddSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == DataType::QASYMM8)) && data.ci.has_sve(); @@ -77,7 +77,7 @@ static const AddKernel available_kernels[] = REGISTER_QASYMM8_SVE(arm_compute::cpu::add_qasymm8_sve) }, { - "add_qasymm8_signed_sve", + "sve2_qs8_add", [](const AddSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == 
DataType::QASYMM8_SIGNED)) && data.ci.has_sve(); @@ -85,7 +85,7 @@ static const AddKernel available_kernels[] = REGISTER_QASYMM8_SIGNED_SVE(arm_compute::cpu::add_qasymm8_signed_sve) }, { - "add_qsymm16_sve", + "sve2_qs16_add", [](const AddSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == DataType::QSYMM16)) && data.ci.has_sve(); @@ -95,7 +95,7 @@ static const AddKernel available_kernels[] = #endif /* !defined(ARM_COMPUTE_ENABLE_SVE2) */ #if defined(ARM_COMPUTE_ENABLE_SVE) { - "add_same_sve", + "sve_fp32_add", [](const AddSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == DataType::F32)) && data.ci.has_sve(); @@ -103,7 +103,7 @@ static const AddKernel available_kernels[] = REGISTER_FP32_SVE(arm_compute::cpu::add_same_sve) }, { - "add_same_sve", + "sve_fp16_add", [](const AddSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == DataType::F16)) && data.ci.has_sve(); @@ -111,7 +111,7 @@ static const AddKernel available_kernels[] = REGISTER_FP16_SVE(arm_compute::cpu::add_same_sve) }, { - "add_same_sve", + "sve_u8_add", [](const AddSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == data.dt3) && (data.dt1 == DataType::U8)) && data.ci.has_sve(); @@ -119,7 +119,7 @@ static const AddKernel available_kernels[] = REGISTER_INTEGER_SVE(arm_compute::cpu::add_same_sve) }, { - "add_same_sve", + "sve_s16_add", [](const AddSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == data.dt3) && (data.dt1 == DataType::S16)) && data.ci.has_sve(); @@ -127,7 +127,7 @@ static const AddKernel available_kernels[] = REGISTER_INTEGER_SVE(arm_compute::cpu::add_same_sve) }, { - "add_same_sve", + "sve_s32_add", [](const AddSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == data.dt3) && (data.dt1 == DataType::S32)) && data.ci.has_sve(); @@ -135,7 +135,7 @@ static const AddKernel available_kernels[] = REGISTER_INTEGER_SVE(arm_compute::cpu::add_same_sve) }, { - "add_u8_s16_s16_sve", + 
"sve_u8_s16_s16_add", [](const AddSelectorData & data) { return ((data.dt1 == DataType::U8) && (data.dt2 == DataType::S16)) && data.ci.has_sve(); @@ -143,7 +143,7 @@ static const AddKernel available_kernels[] = REGISTER_INTEGER_SVE(arm_compute::cpu::add_u8_s16_s16_sve) }, { - "add_s16_u8_s16_sve", + "sve_s16_u8_s16_add", [](const AddSelectorData & data) { return ((data.dt1 == DataType::S16) && (data.dt2 == DataType::U8)) && data.ci.has_sve(); @@ -151,7 +151,7 @@ static const AddKernel available_kernels[] = REGISTER_INTEGER_SVE(arm_compute::cpu::add_s16_u8_s16_sve) }, { - "add_u8_u8_s16_sve", + "sve_u8_u8_s16_add", [](const AddSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt3 == DataType::S16)) && data.ci.has_sve(); @@ -161,13 +161,13 @@ static const AddKernel available_kernels[] = #endif /* defined(ARM_COMPUTE_ENABLE_SVE) */ #if defined(ARM_COMPUTE_ENABLE_NEON) { - "add_same_neon", + "neon_fp32_add", [](const AddSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == DataType::F32)); }, REGISTER_FP32_NEON(arm_compute::cpu::add_same_neon) }, #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) { - "add_same_neon", + "neon_fp16_add", [](const AddSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == DataType::F16)) && data.ci.has_fp16(); @@ -176,49 +176,49 @@ static const AddKernel available_kernels[] = }, #endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */ { - "add_same_neon", + "neon_u8_add", [](const AddSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == data.dt3) && (data.dt1 == DataType::U8)); }, REGISTER_INTEGER_NEON(arm_compute::cpu::add_same_neon) }, { - "add_same_neon", + "neon_s16_add", [](const AddSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == data.dt3) && (data.dt1 == DataType::S16)); }, REGISTER_INTEGER_NEON(arm_compute::cpu::add_same_neon) }, { - "add_same_neon", + "neon_s32_add", [](const AddSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == 
data.dt3) && (data.dt1 == DataType::S32)); }, REGISTER_INTEGER_NEON(arm_compute::cpu::add_same_neon) }, { - "add_u8_s16_s16_neon", + "neon_u8_s16_s16_add", [](const AddSelectorData & data) { return ((data.dt1 == DataType::U8) && (data.dt2 == DataType::S16)); }, REGISTER_INTEGER_NEON(arm_compute::cpu::add_u8_s16_s16_neon) }, { - "add_s16_u8_s16_neon", + "neon_s16_u8_s16_add", [](const AddSelectorData & data) { return ((data.dt1 == DataType::S16) && (data.dt2 == DataType::U8)); }, REGISTER_INTEGER_NEON(arm_compute::cpu::add_s16_u8_s16_neon) }, { - "add_u8_u8_s16_neon", + "neon_u8_u8_s16_add", [](const AddSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt3 == DataType::S16)); }, REGISTER_INTEGER_NEON(arm_compute::cpu::add_u8_u8_s16_neon) }, #endif /* defined(ARM_COMPUTE_ENABLE_NEON) */ #if defined(ARM_COMPUTE_ENABLE_NEON) || defined(ARM_COMPUTE_ENABLE_SVE) { - "add_qasymm8_neon", + "neon_qu8_add", [](const AddSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == DataType::QASYMM8)); }, REGISTER_QASYMM8_NEON(arm_compute::cpu::add_qasymm8_neon) }, { - "add_qasymm8_signed_neon", + "neon_qs8_add", [](const AddSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == DataType::QASYMM8_SIGNED)); }, REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::add_qasymm8_signed_neon) }, { - "add_qsymm16_neon", + "neon_qs16_add", [](const AddSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == DataType::QSYMM16)); }, REGISTER_QSYMM16_NEON(arm_compute::cpu::add_qsymm16_neon) }, @@ -339,7 +339,12 @@ void CpuAddKernel::configure(const ITensorInfo *src0, const ITensorInfo *src1, I ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*src0, *src1, *dst, policy)); - _policy = policy; + const auto uk = get_implementation(CPUInfo::get(), src0->data_type(), src1->data_type(), dst->data_type()); + ARM_COMPUTE_ERROR_ON_NULLPTR(uk); + + _policy = policy; + _run_method = uk->ukernel; + _name = 
std::string("CpuAddKernel").append("/").append(uk->name); // Configure kernel window auto win_config = validate_and_configure_window(*src0, *src1, *dst); @@ -364,20 +369,18 @@ void CpuAddKernel::run_op(ITensorPack &tensors, const Window &window, const Thre ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); ARM_COMPUTE_ERROR_ON(tensors.empty()); + ARM_COMPUTE_ERROR_ON(_run_method == nullptr); const ITensor *src0 = tensors.get_const_tensor(TensorType::ACL_SRC_0); const ITensor *src1 = tensors.get_const_tensor(TensorType::ACL_SRC_1); ITensor *dst = tensors.get_tensor(TensorType::ACL_DST); - const auto *uk = get_implementation(CPUInfo::get(), src0->info()->data_type(), src1->info()->data_type(), dst->info()->data_type()); - ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); - - uk->ukernel(src0, src1, dst, _policy, window); + _run_method(src0, src1, dst, _policy, window); } const char *CpuAddKernel::name() const { - return "CpuAddKernel"; + return _name.c_str(); } } // namespace kernels } // namespace cpu diff --git a/src/core/cpu/kernels/CpuAddKernel.h b/src/core/cpu/kernels/CpuAddKernel.h index a36ec7ad65..3ebaa462ee 100644 --- a/src/core/cpu/kernels/CpuAddKernel.h +++ b/src/core/cpu/kernels/CpuAddKernel.h @@ -61,12 +61,9 @@ public: * @param[in] policy Overflow policy. */ void configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, ConvertPolicy policy); - /** Static function to check if given info will lead to a valid configuration of @ref CpuAddKernel + /** Static function to check if given info will lead to a valid configuration * - * @param[in] src0 First input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32 - * @param[in] src1 Second input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32 - * @param[in] dst The dst tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32. - * @param[in] policy Overflow policy. 
+ * Similar to CpuAddKernel::configure() * * @return a status */ @@ -76,8 +73,13 @@ public: void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; +private: + using AddKernelPtr = std::add_pointer::type; + private: ConvertPolicy _policy{}; + AddKernelPtr _run_method{ nullptr }; + std::string _name{}; }; } // namespace kernels } // namespace cpu diff --git a/src/core/cpu/kernels/CpuElementwiseKernel.cpp b/src/core/cpu/kernels/CpuElementwiseKernel.cpp index dc0c5b210d..dc574fce65 100644 --- a/src/core/cpu/kernels/CpuElementwiseKernel.cpp +++ b/src/core/cpu/kernels/CpuElementwiseKernel.cpp @@ -58,69 +58,68 @@ struct ElementwiseKernel UKernelType *ukernel; }; -template -std::function -configure_arithm_func(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst) +template +CpuElementwiseKernel::UKernelInfo configure_arithm_func(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst) { ARM_COMPUTE_UNUSED(src1, dst); static ElementwiseKernel kernels[] = { #if defined(ARM_COMPUTE_ENABLE_SVE) { - "sve_elementwise_fp32", + "sve_fp32_elementwise", [](const ElementwiseSelectorData & data) { return data.dt == DataType::F32 && data.ci.has_sve(); }, REGISTER_FP32_SVE((arm_compute::cpu::elementwise_arithmetic_op)) }, { - "sve_elementwise_s32", + "sve_s32_elementwise", [](const ElementwiseSelectorData & data) { return data.dt == DataType::S32 && data.ci.has_sve(); }, REGISTER_INTEGER_SVE((arm_compute::cpu::elementwise_arithmetic_op)) }, { - "sve_elementwise_s16", + "sve_s16_elementwise", [](const ElementwiseSelectorData & data) { return data.dt == DataType::S16 && data.ci.has_sve(); }, REGISTER_INTEGER_SVE((arm_compute::cpu::elementwise_arithmetic_op)) }, #endif /* defined(ARM_COMPUTE_ENABLE_SVE) */ #if defined(ARM_COMPUTE_ENABLE_NEON) { - "neon_elementwise_f32", + "neon_fp32_elementwise", [](const ElementwiseSelectorData & data) { return data.dt == DataType::F32; }, 
REGISTER_FP32_NEON((arm_compute::cpu::elementwise_arithm_op>)) }, { - "neon_elementwise_s32", + "neon_s32_elementwise", [](const ElementwiseSelectorData & data) { return data.dt == DataType::S32; }, REGISTER_INTEGER_NEON((arm_compute::cpu::elementwise_arithm_op>)) }, #endif /* defined(ARM_COMPUTE_ENABLE_NEON) */ #if defined(ARM_COMPUTE_ENABLE_SVE2) { - "sve2_elementwise_qu8", + "sve2_qu8_elementwise", [](const ElementwiseSelectorData & data) { return data.dt == DataType::QASYMM8 && data.ci.has_sve2(); }, REGISTER_QASYMM8_SVE((arm_compute::cpu::elementwise_arithmetic_quantized_op)) }, { - "sve2_elementwise_qs8", + "sve2_qs8_elementwise", [](const ElementwiseSelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED && data.ci.has_sve2(); }, REGISTER_QASYMM8_SIGNED_SVE((arm_compute::cpu::elementwise_arithmetic_quantized_op)) }, #endif /* defined(ARM_COMPUTE_ENABLE_SVE2) */ #if defined(ARM_COMPUTE_ENABLE_NEON) || defined(ARM_COMPUTE_ENABLE_SVE) { - "neon_elementwise_qu8", + "neon_qu8_elementwise", [](const ElementwiseSelectorData & data) { return data.dt == DataType::QASYMM8; }, REGISTER_QASYMM8_NEON((arm_compute::cpu::elementwise_arithm_op_quantized)) }, { - "neon_elementwise_qs8", + "neon_qs8_elementwise", [](const ElementwiseSelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED; }, REGISTER_QASYMM8_SIGNED_NEON((arm_compute::cpu::elementwise_arithm_op_quantized_signed)) }, #endif /* defined(ARM_COMPUTE_ENABLE_NEON) || defined(ARM_COMPUTE_ENABLE_SVE) */ #if defined(ARM_COMPUTE_ENABLE_SVE) { - "sve_elementwise_f16", + "sve_fp16_elementwise", [](const ElementwiseSelectorData & data) { return data.dt == DataType::F16 && data.ci.has_sve(); }, REGISTER_FP16_SVE((arm_compute::cpu::elementwise_arithmetic_op)) }, @@ -128,13 +127,13 @@ configure_arithm_func(const ITensorInfo *src0, const ITensorInfo *src1, ITensorI #if defined(ARM_COMPUTE_ENABLE_NEON) #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) { - "neon_elementwise_f16", + "neon_fp16_elementwise", 
[](const ElementwiseSelectorData & data) { return data.dt == DataType::F16 && data.ci.has_fp16(); }, REGISTER_FP16_NEON((arm_compute::cpu::elementwise_arithm_op>)) }, #endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */ { - "neon_elementwise_s16", + "neon_s16_elementwise", [](const ElementwiseSelectorData & data) { return data.dt == DataType::S16; }, REGISTER_INTEGER_NEON((arm_compute::cpu::elementwise_arithm_op>)) }, @@ -145,98 +144,97 @@ configure_arithm_func(const ITensorInfo *src0, const ITensorInfo *src1, ITensorI { if(uk.is_selected({ src0->data_type(), CPUInfo::get() })) { - return uk.ukernel; + return { uk.name, uk.ukernel }; } } - return nullptr; + return { "", nullptr }; } -template -std::function -configure_comp_func(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst) +template +CpuElementwiseKernel::UKernelInfo configure_comp_func(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst) { ARM_COMPUTE_UNUSED(src1, dst); static ElementwiseKernel kernels[] = { #if defined(ARM_COMPUTE_ENABLE_SVE) { - "sve_comparison_u8", + "sve_u8_comparison", [](const ElementwiseSelectorData & data) { return data.dt == DataType::U8 && data.ci.has_sve(); }, REGISTER_INTEGER_SVE((arm_compute::cpu::elementwise_comparison_op)) }, { - "sve_comparison_f32", + "sve_fp32_comparison", [](const ElementwiseSelectorData & data) { return data.dt == DataType::F32 && data.ci.has_sve(); }, REGISTER_FP32_SVE((arm_compute::cpu::elementwise_comparison_op)) }, { - "sve_comparison_s16", + "sve_s16_comparison", [](const ElementwiseSelectorData & data) { return data.dt == DataType::S16 && data.ci.has_sve(); }, REGISTER_INTEGER_SVE((arm_compute::cpu::elementwise_comparison_op)) }, { - "sve_comparison_s32", + "sve_s32_comparison", [](const ElementwiseSelectorData & data) { return data.dt == DataType::S32 && data.ci.has_sve(); }, REGISTER_INTEGER_SVE((arm_compute::cpu::elementwise_comparison_op)) }, #endif /* defined(ARM_COMPUTE_ENABLE_SVE) */ #if 
defined(ARM_COMPUTE_ENABLE_NEON) { - "neon_comparison_u8", + "neon_u8_comparison", [](const ElementwiseSelectorData & data) { return data.dt == DataType::U8; }, REGISTER_INTEGER_NEON((arm_compute::cpu::elementwise_comp_op_8)) }, { - "neon_comparison_f32", + "neon_fp32_comparison", [](const ElementwiseSelectorData & data) { return data.dt == DataType::F32; }, REGISTER_FP32_NEON((arm_compute::cpu::elementwise_comp_op_32)) }, { - "neon_comparison_s16", + "neon_s16_comparison", [](const ElementwiseSelectorData & data) { return data.dt == DataType::S16; }, REGISTER_INTEGER_NEON((arm_compute::cpu::elementwise_comp_op_16)) }, { - "neon_comparison_s32", + "neon_s32_comparison", [](const ElementwiseSelectorData & data) { return data.dt == DataType::S32; }, REGISTER_INTEGER_NEON((arm_compute::cpu::elementwise_comp_op_32)) }, #endif /* defined(ARM_COMPUTE_ENABLE_NEON) */ #if defined(ARM_COMPUTE_ENABLE_SVE2) { - "sve_comparison_qu8", + "sve2_qu8_comparison", [](const ElementwiseSelectorData & data) { return data.dt == DataType::QASYMM8 && data.ci.has_sve2(); }, REGISTER_QASYMM8_SVE((arm_compute::cpu::elementwise_comparison_quantized_op)) }, { - "sve_comparison_qs8", + "sve2_qs8_comparison", [](const ElementwiseSelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED && data.ci.has_sve2(); }, REGISTER_QASYMM8_SIGNED_SVE((arm_compute::cpu::elementwise_comparison_quantized_op)) }, #endif /* defined(ARM_COMPUTE_ENABLE_SVE2) */ #if defined(ARM_COMPUTE_ENABLE_NEON) || defined(ARM_COMPUTE_ENABLE_SVE) { - "neon_comparison_qu8", + "neon_qu8_comparison", [](const ElementwiseSelectorData & data) { return data.dt == DataType::QASYMM8; }, REGISTER_QASYMM8_NEON((arm_compute::cpu::elementwise_comp_op_quantized)) }, { - "neon_comparison_qs8", + "neon_qs8_comparison", [](const ElementwiseSelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED; }, REGISTER_QASYMM8_SIGNED_NEON((arm_compute::cpu::elementwise_comp_op_quantized_signed)) }, #endif /* 
defined(ARM_COMPUTE_ENABLE_NEON) || defined(ARM_COMPUTE_ENABLE_SVE) */ #if defined(ARM_COMPUTE_ENABLE_SVE) { - "sve_comparison_f16", + "sve_fp16_comparison", [](const ElementwiseSelectorData & data) { return data.dt == DataType::F16 && data.ci.has_sve(); }, REGISTER_FP16_SVE((arm_compute::cpu::elementwise_comparison_op)) }, #endif /* defined(ARM_COMPUTE_ENABLE_SVE) */ #if defined(ARM_COMPUTE_ENABLE_NEON) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) { - "neon_comparison_f16", + "neon_fp16_comparison", [](const ElementwiseSelectorData & data) { return data.dt == DataType::F16 && data.ci.has_fp16(); }, REGISTER_FP16_NEON((arm_compute::cpu::elementwise_comp_op_16)) }, @@ -247,11 +245,11 @@ configure_comp_func(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInf { if(uk.is_selected({ src0->data_type(), CPUInfo::get() })) { - return uk.ukernel; + return { uk.name, uk.ukernel }; } } - return nullptr; + return { "", nullptr }; } } // namespace @@ -278,6 +276,11 @@ void CpuElementwiseKernel::configure_common(const ITensorInfo *src0, const ITens { ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); + const auto uk = get_implementation(src0, src1, dst); + + _run_method = uk.ukernel; + _name = std::string("CpuElementwiseKernel").append("/").append(uk.name); + // If any of shapes is dynamic, expect a configured window and dst at run-time. 
if(src0->is_dynamic() || src1->is_dynamic()) { @@ -292,22 +295,26 @@ void CpuElementwiseKernel::configure_common(const ITensorInfo *src0, const ITens void CpuElementwiseKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) { ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON(_run_method == nullptr); auto src0 = tensors.get_const_tensor(TensorType::ACL_SRC_0); auto src1 = tensors.get_const_tensor(TensorType::ACL_SRC_1); auto dst = tensors.get_tensor(TensorType::ACL_DST); - auto function = get_implementation(src0->info(), src1->info(), dst->info()); - ARM_COMPUTE_ERROR_ON(function == nullptr); - function(src0, src1, dst, window); + _run_method(src0, src1, dst, window); +} + +const char *CpuElementwiseKernel::name() const +{ + return _name.c_str(); } /** Arithmetic operators (min, max, squared_diff) */ void CpuArithmeticKernel::configure(ArithmeticOperation op, const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst) { ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*src0, *src1, *dst)); - configure_common(src0, src1, dst); _op = op; + configure_common(src0, src1, dst); } Status CpuArithmeticKernel::validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst) @@ -329,8 +336,7 @@ Status CpuArithmeticKernel::validate(ArithmeticOperation op, const ITensorInfo * return Status{}; } -std::function -CpuArithmeticKernel::get_implementation(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst) +CpuElementwiseKernel::UKernelInfo CpuArithmeticKernel::get_implementation(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst) { switch(_op) { @@ -349,7 +355,7 @@ CpuArithmeticKernel::get_implementation(const ITensorInfo *src0, const ITensorIn default: ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); } - return nullptr; + return { "", nullptr }; } /** The division operator */ @@ -357,8 +363,8 @@ CpuArithmeticKernel::get_implementation(const ITensorInfo *src0, const ITensorIn void 
CpuDivisionKernel::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst) { ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*src0, *src1, *dst)); - configure_common(src0, src1, dst); _op = ArithmeticOperation::DIV; + configure_common(src0, src1, dst); } Status CpuDivisionKernel::validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst) @@ -378,8 +384,8 @@ Status CpuDivisionKernel::validate(const ITensorInfo *src0, const ITensorInfo *s void CpuPowerKernel::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst) { ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*src0, *src1, *dst)); - configure_common(src0, src1, dst); _op = ArithmeticOperation::POWER; + configure_common(src0, src1, dst); } Status CpuPowerKernel::validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst) @@ -399,8 +405,8 @@ Status CpuPowerKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1 void CpuComparisonKernel::configure(ComparisonOperation op, const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst) { ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*src0, *src1, *dst)); - configure_common(src0, src1, dst); _op = op; + configure_common(src0, src1, dst); } Status CpuComparisonKernel::validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst) @@ -422,8 +428,7 @@ Status CpuComparisonKernel::validate(ComparisonOperation op, const ITensorInfo * return Status{}; } -std::function -CpuComparisonKernel::get_implementation(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst) +CpuElementwiseKernel::UKernelInfo CpuComparisonKernel::get_implementation(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst) { switch(_op) { @@ -442,7 +447,7 @@ CpuComparisonKernel::get_implementation(const ITensorInfo *src0, const ITensorIn default: ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); } - return nullptr; + return { "", nullptr }; } } // 
namespace kernels } // namespace cpu diff --git a/src/core/cpu/kernels/CpuElementwiseKernel.h b/src/core/cpu/kernels/CpuElementwiseKernel.h index 952c6e3e25..50c8d29ac5 100644 --- a/src/core/cpu/kernels/CpuElementwiseKernel.h +++ b/src/core/cpu/kernels/CpuElementwiseKernel.h @@ -43,25 +43,19 @@ namespace kernels class CpuElementwiseKernel : public ICpuKernel { public: - const char *name() const override - { - return "CpuElementwiseKernel"; - } - CpuElementwiseKernel() = default; ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuElementwiseKernel); - /** Common signature for all the specialised arithmetic functions - * - * @param[in] src0 First tensor input info. Data types supported: QASYMM8/S16/F16/S32/F32. - * @param[in] src1 Second tensor input info. Data types supported: Same as @p src0. - * @param[out] dst Output tensor info. Data types supported: Dependent on subclass. - * @param[in] window Region on which to execute the kernel. - */ using ElementwiseFunction = void(const ITensor *, const ITensor *, ITensor *, const Window &); + struct UKernelInfo + { + std::string name; + std::function ukernel; + }; // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + const char *name() const override; protected: /** Validate the argument passed to the kernel @@ -85,7 +79,11 @@ protected: * * @return the function instance for the micro kernel */ - virtual std::function get_implementation(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst) = 0; + virtual UKernelInfo get_implementation(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst) = 0; + +protected: + std::function _run_method{ nullptr }; + std::string _name{}; }; class CpuArithmeticKernel : public CpuElementwiseKernel @@ -103,14 +101,11 @@ public: */ void configure(ArithmeticOperation op, const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst); - /** Static function to check if given info will lead to a valid 
configuration of @ref cpu::kernels::CpuArithmeticKernel + /** Static function to check if given info will lead to a valid configuration * - * @param[in] op Arithmetic operation to be executed. - * @param[in] src0 First tensor input info. Data types supported: QASYMM8/S16/F16/S32/F32. - * @param[in] src1 Second tensor input info. Data types supported: Same as @p src0. - * @param[in] dst Output tensor info. Data types supported: Same as @p src0. + * Similar to CpuArithmeticKernel::configure() * - * @return a Status + * @return a status */ static Status validate(ArithmeticOperation op, const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst); @@ -129,7 +124,7 @@ private: * * @return the function instance for the micro kernel */ - std::function get_implementation(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst) override; + UKernelInfo get_implementation(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst) override; }; class CpuDivisionKernel : public CpuArithmeticKernel @@ -146,13 +141,11 @@ public: */ void configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst); - /** Static function to check if given info will lead to a valid configuration of @ref CpuDivisionKernel + /** Static function to check if given info will lead to a valid configuration * - * @param[in] src0 First tensor input info. Data types supported: S32/F16/F32. - * @param[in] src1 Second tensor input info. Data types supported: Same as @p src0. - * @param[in] dst Output tensor info. Data types supported: Same as @p src0. 
+ * Similar to CpuDivisionKernel::configure() * - * @return a Status + * @return a status */ static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst); @@ -175,13 +168,11 @@ public: */ void configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst); - /** Static function to check if given info will lead to a valid configuration of @ref CpuPowerKernel + /** Static function to check if given info will lead to a valid configuration * - * @param[in] src0 First tensor input info. Data types supported: F16/F32. - * @param[in] src1 Second tensor input info. Data types supported: Same as @p src0. - * @param[in] dst Output tensor info. Data types supported: Same as @p src0. + * Similar to CpuPowerKernel::configure() * - * @return a Status + * @return a status */ static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst); @@ -205,14 +196,11 @@ public: */ void configure(ComparisonOperation op, const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst); - /** Static function to check if given info will lead to a valid configuration of @ref cpu::kernels::CpuComparisonKernel + /** Static function to check if given info will lead to a valid configuration * - * @param[in] op Comparison operation to be executed. - * @param[in] src0 First tensor input info. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32. - * @param[in] src1 Second tensor input info. Data types supported: Same as @p src0. - * @param[in] dst Output tensor info. Data types supported: U8. 
+ * Similar to CpuComparisonKernel::configure() * - * @return a Status + * @return a status */ static Status validate(ComparisonOperation op, const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst); @@ -229,7 +217,7 @@ private: * * @return the function instance for the micro kernel */ - std::function get_implementation(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst) override; + UKernelInfo get_implementation(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst) override; ComparisonOperation _op{}; }; diff --git a/src/core/cpu/kernels/CpuFloorKernel.cpp b/src/core/cpu/kernels/CpuFloorKernel.cpp index c2e9d48ce9..d41df6a1f5 100644 --- a/src/core/cpu/kernels/CpuFloorKernel.cpp +++ b/src/core/cpu/kernels/CpuFloorKernel.cpp @@ -54,18 +54,18 @@ struct FloorUKernel { const char *name; const FloorSelectorPtr is_selected; - FloorUKernelPtr func; + FloorUKernelPtr ukernel; }; static const FloorUKernel available_kernels[] = { { - "fp16_neon_floor", + "neon_fp16_floor", [](const FloorSelectorData & data) { return data.dt == DataType::F16; }, REGISTER_FP16_NEON(arm_compute::cpu::fp16_neon_floor) }, { - "f32_neon_floor", + "neon_fp32_floor", [](const FloorSelectorData & data) { return data.dt == DataType::F32; }, REGISTER_FP32_NEON(arm_compute::cpu::fp32_neon_floor) }, @@ -94,7 +94,7 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst) ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); const auto *uk = get_implementation(FloorSelectorData{ src->data_type() }); - ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->func == nullptr); + ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); // Validate in case of configured output if(dst->total_size() > 0) @@ -110,12 +110,15 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst) void CpuFloorKernel::configure(const ITensorInfo *src, ITensorInfo *dst) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + 
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst)); - // Auto initialize output auto_init_if_empty(*dst, src->tensor_shape(), 1, src->data_type()); - // Validate - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst)); + const auto *uk = get_implementation(FloorSelectorData{ src->data_type() }); + ARM_COMPUTE_ERROR_ON_NULLPTR(uk); + + _run_method = uk->ukernel; + _name = std::string("CpuFloorKernel").append("/").append(uk->name); // Configure kernel window const Window win = calculate_max_window(*src, Steps()); @@ -146,12 +149,11 @@ void CpuFloorKernel::run_op(ITensorPack &tensors, const Window &window, const Th ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); ARM_COMPUTE_ERROR_ON(tensors.empty()); + ARM_COMPUTE_ERROR_ON(_run_method == nullptr); const ITensor *src = tensors.get_const_tensor(TensorType::ACL_SRC); ITensor *dst = tensors.get_tensor(TensorType::ACL_DST); - - const auto len = static_cast(window.x().end()) - static_cast(window.x().start()); - const auto *ukernel = get_implementation(FloorSelectorData{ src->info()->data_type() }); + const auto len = static_cast(window.x().end()) - static_cast(window.x().start()); Window win{ window }; win.set(Window::DimX, Window::Dimension(0, 1, 1)); @@ -161,14 +163,14 @@ void CpuFloorKernel::run_op(ITensorPack &tensors, const Window &window, const Th execute_window_loop(win, [&](const Coordinates &) { - ukernel->func(src_it.ptr(), dst_it.ptr(), len); + _run_method(src_it.ptr(), dst_it.ptr(), len); }, src_it, dst_it); } const char *CpuFloorKernel::name() const { - return "CpuFloorKernel"; + return _name.c_str(); } } // namespace kernels } // namespace cpu diff --git a/src/core/cpu/kernels/CpuFloorKernel.h b/src/core/cpu/kernels/CpuFloorKernel.h index 2680871b45..78534d2a1d 100644 --- a/src/core/cpu/kernels/CpuFloorKernel.h +++ b/src/core/cpu/kernels/CpuFloorKernel.h @@ -45,10 +45,9 @@ public: * @param[out] dst Destination tensor. 
Same as @p src */ void configure(const ITensorInfo *src, ITensorInfo *dst); - /** Static function to check if given info will lead to a valid configuration of @ref CpuFloorKernel + /** Static function to check if given info will lead to a valid configuration * - * @param[in] src Source tensor info. Data type supported: F16/F32. - * @param[in] dst Destination tensor info. Same as @p src + * Similar to CpuFloorKernel::configure() * * @return a status */ @@ -65,6 +64,13 @@ public: // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; + +private: + using FloorUKernelPtr = std::add_pointer::type; + +private: + FloorUKernelPtr _run_method{ nullptr }; + std::string _name{}; }; } // namespace kernels } // namespace cpu diff --git a/src/core/cpu/kernels/CpuPool2dKernel.cpp b/src/core/cpu/kernels/CpuPool2dKernel.cpp index bfde2dfa80..27f4b950db 100644 --- a/src/core/cpu/kernels/CpuPool2dKernel.cpp +++ b/src/core/cpu/kernels/CpuPool2dKernel.cpp @@ -72,92 +72,92 @@ struct PoolingKernel static const PoolingKernel available_kernels[] = { { - "poolingMxN_qasymm8_neon_nhwc", + "neon_qu8_nhwc_poolMxN", [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NHWC) && (data.dt == DataType::QASYMM8)); }, REGISTER_QASYMM8_NEON(arm_compute::cpu::poolingMxN_qasymm8_neon_nhwc) }, { - "poolingMxN_qasymm8_signed_neon_nhwc", + "neon_qs8_nhwc_poolMxN", [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NHWC) && (data.dt == DataType::QASYMM8_SIGNED)); }, REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::poolingMxN_qasymm8_signed_neon_nhwc) }, #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) { - "poolingMxN_fp16_neon_nhwc", + "neon_f16_nhwc_poolMxN", [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NHWC) && (data.dt == DataType::F16)); }, REGISTER_FP16_NEON(arm_compute::cpu::poolingMxN_fp16_neon_nhwc) }, #endif /* 
defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */ { - "poolingMxN_fp32_neon_nhwc", + "neon_fp32_nhwc_poolMxN", [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NHWC) && (data.dt == DataType::F32)); }, REGISTER_FP32_NEON(arm_compute::cpu::poolingMxN_fp32_neon_nhwc) }, #if defined(ENABLE_NCHW_KERNELS) { - "pooling2_qasymm8_neon_nchw", + "neon_qu8_nchw_pool2", [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 2) && (data.pool_stride_x < 3)); }, REGISTER_QASYMM8_NEON(arm_compute::cpu::pooling2_quantized_neon_nchw) }, { - "pooling3_qasymm8_neon_nchw", + "neon_qu8_nchw_pool3", [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 3) && (data.pool_stride_x < 3)); }, REGISTER_QASYMM8_NEON(arm_compute::cpu::pooling3_quantized_neon_nchw) }, { - "poolingMxN_qasymm8_neon_nchw", + "neon_qu8_nchw_poolMxN", [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8)); }, REGISTER_QASYMM8_NEON(arm_compute::cpu::poolingMxN_quantized_neon_nchw) }, { - "pooling2_qasymm8_signed_neon_nchw", + "neon_qs8_nchw_pool2", [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8_SIGNED) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 2) && (data.pool_stride_x < 3)); }, REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::pooling2_quantized_neon_nchw) }, { - "pooling3_qasymm8_signed_neon_nchw", + "neon_qs8_nchw_pool3", [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8_SIGNED) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 3) && (data.pool_stride_x < 3)); }, 
REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::pooling3_quantized_neon_nchw) }, { - "poolingMxN_qasymm8_signed_neon_nchw", + "neon_qs8_nchw_poolMxN", [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8_SIGNED)); }, REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::poolingMxN_quantized_neon_nchw) }, #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) { - "pooling2_fp16_neon_nchw", + "neon_fp16_nchw_pool2", [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F16) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 2)); }, REGISTER_FP16_NEON(arm_compute::cpu::pooling2_fp16_neon_nchw) }, { - "pooling3_fp16_neon_nchw", + "neon_fp16_nchw_pool3", [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F16) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 3)); }, REGISTER_FP16_NEON(arm_compute::cpu::pooling3_fp16_neon_nchw) }, { - "poolingMxN_fp16_neon_nchw", + "neon_fp16_nchw_poolMxN", [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F16)); }, REGISTER_FP16_NEON(arm_compute::cpu::poolingMxN_fp16_neon_nchw) }, #endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */ { - "pooling2_fp32_neon_nchw", + "neon_fp32_nchw_pool2", [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F32) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 2)); }, REGISTER_FP32_NEON(arm_compute::cpu::pooling2_fp32_neon_nchw) }, { - "pooling3_fp32_neon_nchw", + "neon_fp32_nchw_pool3", [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F32) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 3)); }, REGISTER_FP32_NEON(arm_compute::cpu::pooling3_fp32_neon_nchw) }, { - "pooling7_fp32_neon_nchw", + "neon_fp32_nchw_pool7", 
[](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F32) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 7)); }, REGISTER_FP32_NEON(arm_compute::cpu::pooling7_fp32_neon_nchw) }, { - "poolingMxN_fp32_neon_nchw", + "neon_fp32_nchw_poolMxN", [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F32)); }, REGISTER_FP32_NEON(arm_compute::cpu::poolingMxN_fp32_neon_nchw) }, @@ -398,11 +398,16 @@ void CpuPool2dKernel::configure(ITensorInfo *src, ITensorInfo *dst, const Poolin // Perform validation step ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, pool_info, indices, pool_size)); + const auto *uk = get_implementation(src->data_type(), src->data_layout(), pad_stride_info.stride().first, pool_size); + ARM_COMPUTE_ERROR_ON(uk == nullptr); + // Set instance variables _pool_info = pool_info; _data_layout = src->data_layout(); _pool_size = pool_size; _pool_stride_x = pad_stride_info.stride().first; + _run_method = uk->ukernel; + _name = std::string("CpuPool2dKernel").append("/").append(uk->name); if(_data_layout == DataLayout::NHWC) { @@ -451,6 +456,7 @@ void CpuPool2dKernel::run_op(ITensorPack &tensors, const Window &window, const T ARM_COMPUTE_UNUSED(info); ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); + ARM_COMPUTE_ERROR_ON(_run_method == nullptr); const ITensor *src = tensors.get_const_tensor(TensorType::ACL_SRC_0); ITensor *dst = tensors.get_tensor(TensorType::ACL_DST_0); @@ -498,16 +504,12 @@ void CpuPool2dKernel::run_op(ITensorPack &tensors, const Window &window, const T window_src.set(Window::DimY, Window::Dimension(0, src->info()->dimension(1), pool_stride_x)); window_src.set(Window::DimZ, Window::Dimension(0, src->info()->dimension(2), pool_stride_y)); } - - const auto *uk = get_implementation(src->info()->data_type(), _data_layout, _pool_stride_x, _pool_size); - 
ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); - - uk->ukernel(src, dst, indices, _pool_info, window_src, window); + _run_method(src, dst, indices, _pool_info, window_src, window); } const char *CpuPool2dKernel::name() const { - return "CpuPool2dKernel"; + return _name.c_str(); } } // namespace kernels } // namespace cpu diff --git a/src/core/cpu/kernels/CpuPool2dKernel.h b/src/core/cpu/kernels/CpuPool2dKernel.h index 95298004e9..ff7d7bb21d 100644 --- a/src/core/cpu/kernels/CpuPool2dKernel.h +++ b/src/core/cpu/kernels/CpuPool2dKernel.h @@ -64,6 +64,9 @@ public: BorderSize border_size() const override; const char *name() const override; +private: + using PoolingKernelPtr = std::add_pointer::type; + private: PoolingLayerInfo _pool_info{}; DataLayout _data_layout{ DataLayout::UNKNOWN }; @@ -71,6 +74,8 @@ private: BorderSize _border_size{ 0 }; Size2D _pool_size{}; int _pool_stride_x{}; + PoolingKernelPtr _run_method{ nullptr }; + std::string _name{}; }; } // namespace kernels } // namespace cpu diff --git a/src/core/cpu/kernels/CpuScaleKernel.cpp b/src/core/cpu/kernels/CpuScaleKernel.cpp index a072dbd896..0c1f08ab79 100644 --- a/src/core/cpu/kernels/CpuScaleKernel.cpp +++ b/src/core/cpu/kernels/CpuScaleKernel.cpp @@ -67,32 +67,32 @@ static const ScaleKernel available_kernels[] = { #if defined(ARM_COMPUTE_ENABLE_SVE) { - "fp16_sve_scale", + "sve_fp16_scale", [](const ScaleSelectorData & data) { return data.dt == DataType::F16 && data.ci.has_sve(); }, REGISTER_FP16_SVE(arm_compute::cpu::fp16_sve_scale) }, { - "f32_sve_scale", + "sve_fp32_scale", [](const ScaleSelectorData & data) { return data.dt == DataType::F32 && data.ci.has_sve(); }, REGISTER_FP32_SVE(arm_compute::cpu::fp32_sve_scale) }, { - "qasymm8_sve_scale", + "sve_qu8_scale", [](const ScaleSelectorData & data) { return data.dt == DataType::QASYMM8 && data.ci.has_sve(); }, REGISTER_QASYMM8_SVE(arm_compute::cpu::qasymm8_sve_scale) }, { - "qasymm8_signed_sve_scale", + "sve_qs8_scale", [](const 
ScaleSelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED && data.ci.has_sve(); }, REGISTER_QASYMM8_SIGNED_SVE(arm_compute::cpu::qasymm8_signed_sve_scale) }, { - "u8_sve_scale", + "sve_u8_scale", [](const ScaleSelectorData & data) { return data.dt == DataType::U8 && data.ci.has_sve(); }, REGISTER_INTEGER_SVE(arm_compute::cpu::u8_sve_scale) }, { - "s16_sve_scale", + "sve_s16_scale", [](const ScaleSelectorData & data) { return data.dt == DataType::S16 && data.ci.has_sve(); }, REGISTER_INTEGER_SVE(arm_compute::cpu::s16_sve_scale) }, @@ -100,33 +100,33 @@ static const ScaleKernel available_kernels[] = #if defined(ARM_COMPUTE_ENABLE_NEON) #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) { - "common_neon_scale", + "neon_fp16_scale", [](const ScaleSelectorData & data) { return data.dt == DataType::F16 && data.ci.has_fp16(); }, REGISTER_FP16_NEON(arm_compute::cpu::common_neon_scale) }, #endif /* !defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */ { - "common_neon_scale", + "neon_fp32_scale", [](const ScaleSelectorData & data) { return data.dt == DataType::F32; }, REGISTER_FP32_NEON(arm_compute::cpu::common_neon_scale) }, { - "qasymm8_neon_scale", + "neon_qu8_scale", [](const ScaleSelectorData & data) { return data.dt == DataType::QASYMM8; }, REGISTER_QASYMM8_NEON(arm_compute::cpu::qasymm8_neon_scale) }, { - "qasymm8_signed_neon_scale", + "neon_qs8_scale", [](const ScaleSelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED; }, REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::qasymm8_signed_neon_scale) }, { - "common_neon_scale", + "neon_u8_scale", [](const ScaleSelectorData & data) { return data.dt == DataType::U8; }, REGISTER_INTEGER_NEON(arm_compute::cpu::common_neon_scale) }, { - "common_neon_scale", + "neon_s16_scale", [](const ScaleSelectorData & data) { return data.dt == DataType::S16; }, REGISTER_INTEGER_NEON(arm_compute::cpu::common_neon_scale) }, @@ -199,11 +199,6 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dx, 
const I } } // namespace -CpuScaleKernel::CpuScaleKernel() - : _func(nullptr), _policy(), _border_mode(), _constant_border_value(PixelValue()), _sampling_offset(0), _align_corners(false), _data_layout(DataLayout::UNKNOWN) -{ -} - void CpuScaleKernel::configure(const ITensorInfo *src, const ITensorInfo *dx, const ITensorInfo *dy, const ITensorInfo *offsets, ITensorInfo *dst, const ScaleKernelInfo &info) { @@ -217,6 +212,12 @@ void CpuScaleKernel::configure(const ITensorInfo *src, const ITensorInfo *dx, co dst, info)); + const auto *uk = get_implementation(ScaleSelectorData{ src->data_type(), CPUInfo::get() }); + ARM_COMPUTE_ERROR_ON_NULLPTR(uk); + + _run_method = uk->ukernel; + _name = std::string("CpuScaleKernel").append("/").append(uk->name).append("_").append(string_from_interpolation_policy(info.interpolation_policy)); + // Get data layout and width/height indices _data_layout = info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : info.data_layout; const int idx_width = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH); @@ -595,6 +596,7 @@ void CpuScaleKernel::run_op(ITensorPack &tensors, const Window &window, const Th ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); ARM_COMPUTE_ERROR_ON(_func == nullptr && _data_layout == DataLayout::NCHW); + ARM_COMPUTE_ERROR_ON(_run_method == nullptr && _data_layout == DataLayout::NHWC); const auto src = tensors.get_const_tensor(TensorType::ACL_SRC); auto dst = tensors.get_tensor(TensorType::ACL_DST); @@ -608,14 +610,13 @@ void CpuScaleKernel::run_op(ITensorPack &tensors, const Window &window, const Th } else { - const auto *uk = get_implementation(ScaleSelectorData{ src->info()->data_type(), CPUInfo::get() }); - uk->ukernel(src, dst, offsets, dx, dy, _policy, _border_mode, _constant_border_value, _sampling_offset, _align_corners, window); + _run_method(src, dst, offsets, dx, dy, _policy, _border_mode, 
_constant_border_value, _sampling_offset, _align_corners, window); } } const char *CpuScaleKernel::name() const { - return "CpuScaleKernel"; + return _name.c_str(); } } // namespace kernels } // namespace cpu diff --git a/src/core/cpu/kernels/CpuScaleKernel.h b/src/core/cpu/kernels/CpuScaleKernel.h index 24790d16d7..afaf074340 100644 --- a/src/core/cpu/kernels/CpuScaleKernel.h +++ b/src/core/cpu/kernels/CpuScaleKernel.h @@ -39,7 +39,7 @@ class CpuScaleKernel : public ICpuKernel { public: /** Default constructor */ - CpuScaleKernel(); + CpuScaleKernel() = default; ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuScaleKernel); /** Initialise the kernel's inputs, output and interpolation policy * @@ -55,17 +55,11 @@ public: */ void configure(const ITensorInfo *src, const ITensorInfo *dx, const ITensorInfo *dy, const ITensorInfo *offsets, ITensorInfo *dst, const ScaleKernelInfo &info); - /** Static function to check if given info will lead to a valid configuration of @ref CpuScaleKernel + /** Static function to check if given info will lead to a valid configuration * - * @note dx, dy and offsets have the same dimensions (width and height) of the output tensor - * @note Using @p policy Area only supports data layout NCHW and input data type U8. + * Similar to CpuScaleKernel::configure() * - * @param[in] src Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/U8/S16/F16/F32. - * @param[in] dx Distance x tensor info. Pixel's distance between the X real coordinate and the smallest X following integer. Data type supported: F32 - * @param[in] dy Distance y tensor info. Pixel's distance between the Y real coordinate and the smallest Y following integer. Data type supported: F32 - * @param[in] offsets Offset tensor info. Offset to access the pixel with NEAREST interpolation or the top-left pixel with BILINEAR interpolation in the input tensor. Data type supported: S32. - * @param[in] dst Destination tensor info. Data types supported: Same as @p input. 
All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane. - * @param[in] info @ref ScaleKernelInfo to use for validation + * @return a status */ static Status validate(const ITensorInfo *src, const ITensorInfo *dx, const ITensorInfo *dy, const ITensorInfo *offsets, ITensorInfo *dst, const ScaleKernelInfo &info); @@ -96,14 +90,18 @@ private: /** Scale function to use for the particular function to use */ using ScaleFunctionPtr = void (CpuScaleKernel::*)(const ITensor *, ITensor *, const ITensor *, const ITensor *, const ITensor *, const Window &window); + using ScaleKernelPtr = std::add_pointer::type; - ScaleFunctionPtr _func; - InterpolationPolicy _policy; - BorderMode _border_mode; - PixelValue _constant_border_value; - float _sampling_offset; - bool _align_corners; - DataLayout _data_layout; + ScaleFunctionPtr _func{ nullptr }; + InterpolationPolicy _policy{}; + BorderMode _border_mode{}; + PixelValue _constant_border_value{}; + float _sampling_offset{ 0 }; + bool _align_corners{ false }; + DataLayout _data_layout{ DataLayout::UNKNOWN }; + ScaleKernelPtr _run_method{ nullptr }; + std::string _name{}; }; } // namespace kernels } // namespace cpu diff --git a/src/core/cpu/kernels/CpuSoftmaxKernel.cpp b/src/core/cpu/kernels/CpuSoftmaxKernel.cpp index 1e00e12050..c562699092 100644 --- a/src/core/cpu/kernels/CpuSoftmaxKernel.cpp +++ b/src/core/cpu/kernels/CpuSoftmaxKernel.cpp @@ -72,12 +72,12 @@ static const SoftmaxLogits1DKernel available_logits_1d_kernels[] = { #if defined(ARM_COMPUTE_ENABLE_SVE) { - "sve_softmax_logits_1d_float", + "sve_fp32_softmax_logits_1d", [](const SoftmaxSelectorData & data) { return (data.dt == DataType::F32) && data.ci.has_sve(); }, REGISTER_FP32_SVE(arm_compute::cpu::sve_softmax_logits_1d_float) }, { - "sve_softmax_logits_1d_float", + "sve_fp16_softmax_logits_1d", [](const SoftmaxSelectorData & data) { return (data.dt == DataType::F16) && data.ci.has_sve(); }, 
REGISTER_FP16_SVE(arm_compute::cpu::sve_softmax_logits_1d_float) }, @@ -85,13 +85,13 @@ static const SoftmaxLogits1DKernel available_logits_1d_kernels[] = #if defined(ARM_COMPUTE_ENABLE_NEON) { - "neon_softmax_logits_1d_float", + "neon_fp32_softmax_logits_1d", [](const SoftmaxSelectorData & data) { return (data.dt == DataType::F32); }, REGISTER_FP32_NEON(arm_compute::cpu::neon_softmax_logits_1d_float) }, #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) { - "neon_softmax_logits_1d_float", + "neon_fp16_softmax_logits_1d", [](const SoftmaxSelectorData & data) { return (data.dt == DataType::F16); }, REGISTER_FP16_NEON(arm_compute::cpu::neon_softmax_logits_1d_float) }, @@ -100,23 +100,23 @@ static const SoftmaxLogits1DKernel available_logits_1d_kernels[] = #if defined(ARM_COMPUTE_ENABLE_SVE2) { - "sve_softmax_logits_1d_quantized", + "sve2_qu8_softmax_logits_1d", [](const SoftmaxSelectorData & data) { return (data.dt == DataType::QASYMM8) && data.ci.has_sve2(); }, REGISTER_QASYMM8_SVE(arm_compute::cpu::sve_softmax_logits_1d_quantized) }, { - "sve_softmax_logits_1d_quantized", + "sve2_qs8_softmax_logits_1d", [](const SoftmaxSelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED) && data.ci.has_sve2(); }, REGISTER_QASYMM8_SIGNED_SVE(arm_compute::cpu::sve_softmax_logits_1d_quantized) }, #endif /* defined(ARM_COMPUTE_ENABLE_SVE2) */ { - "neon_softmax_logits_1d_quantized", + "neon_qu8_softmax_logits_1d", [](const SoftmaxSelectorData & data) { return (data.dt == DataType::QASYMM8); }, REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_softmax_logits_1d_quantized) }, { - "neon_softmax_logits_1d_quantized", + "neon_qs8_softmax_logits_1d", [](const SoftmaxSelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED); }, REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_softmax_logits_1d_quantized) }, @@ -126,46 +126,46 @@ static const SoftmaxLogits1DMaxKernel available_logits_1d_max_kernels[] = { #if defined(ARM_COMPUTE_ENABLE_SVE) { - "sve_logits_1d_max", + 
"sve_fp32_logits_1d_max", [](const SoftmaxSelectorData & data) { return (data.dt == DataType::F32) && data.ci.has_sve(); }, REGISTER_FP32_SVE(arm_compute::cpu::sve_logits_1d_max) }, { - "sve_logits_1d_max", + "sve_fp16_logits_1d_max", [](const SoftmaxSelectorData & data) { return (data.dt == DataType::F16) && data.ci.has_sve(); }, REGISTER_FP16_SVE(arm_compute::cpu::sve_logits_1d_max) }, { - "sve_logits_1d_max", + "sve_qu8_logits_1d_max", [](const SoftmaxSelectorData & data) { return (data.dt == DataType::QASYMM8) && data.ci.has_sve(); }, REGISTER_QASYMM8_SVE(arm_compute::cpu::sve_logits_1d_max) }, { - "sve_logits_1d_max", + "sve_qs8_logits_1d_max", [](const SoftmaxSelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED) && data.ci.has_sve(); }, REGISTER_QASYMM8_SIGNED_SVE(arm_compute::cpu::sve_logits_1d_max) }, #endif /* defined(ARM_COMPUTE_ENABLE_SVE) */ #if defined(ARM_COMPUTE_ENABLE_NEON) { - "neon_logits_1d_max", + "neon_fp32_logits_1d_max", [](const SoftmaxSelectorData & data) { return (data.dt == DataType::F32); }, REGISTER_FP32_NEON(arm_compute::cpu::neon_logits_1d_max) }, #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) { - "neon_logits_1d_max", + "neon_fp16_logits_1d_max", [](const SoftmaxSelectorData & data) { return (data.dt == DataType::F16); }, REGISTER_FP16_NEON(arm_compute::cpu::neon_logits_1d_max) }, #endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */ { - "neon_logits_1d_max", + "neon_qu8_logits_1d_max", [](const SoftmaxSelectorData & data) { return (data.dt == DataType::QASYMM8); }, REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_logits_1d_max) }, { - "neon_logits_1d_max", + "neon_qs8_logits_1d_max", [](const SoftmaxSelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED); }, REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_logits_1d_max) }, @@ -214,15 +214,9 @@ Status validate_arguments_logits_1d_max(const ITensorInfo &input, const ITensorI } // namespace -CpuLogits1DMaxKernel::CpuLogits1DMaxKernel() -{ -} - void 
CpuLogits1DMaxKernel::configure(const ITensorInfo *src, ITensorInfo *dst) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - - // Perform validation step ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_logits_1d_max(*src, *dst)); // Softmax across the x dimension @@ -230,8 +224,13 @@ void CpuLogits1DMaxKernel::configure(const ITensorInfo *src, ITensorInfo *dst) // Output auto initialization if not yet initialized auto_init_if_empty(*dst, output_shape, 1, src->data_type(), src->quantization_info()); - Window win = calculate_max_window(*src, Steps()); + const auto *uk = get_implementation_logits_max(SoftmaxSelectorData{ src->data_type(), CPUInfo::get() }); + ARM_COMPUTE_ERROR_ON_NULLPTR(uk); + + _run_method = uk->ukernel; + _name = std::string("CpuLogits1DMaxKernel").append("/").append(uk->name); + Window win = calculate_max_window(*src, Steps()); ICpuKernel::configure(win); } @@ -248,17 +247,17 @@ void CpuLogits1DMaxKernel::run_op(ITensorPack &tensors, const Window &window, co ARM_COMPUTE_UNUSED(info); ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); + ARM_COMPUTE_ERROR_ON(_run_method == nullptr); const auto src = tensors.get_const_tensor(TensorType::ACL_SRC); auto dst = tensors.get_tensor(TensorType::ACL_DST); - const auto *uk = get_implementation_logits_max(SoftmaxSelectorData{ src->info()->data_type(), CPUInfo::get() }); - uk->ukernel(src, dst, window); + _run_method(src, dst, window); } const char *CpuLogits1DMaxKernel::name() const { - return "CpuLogits1DMaxKernel"; + return _name.c_str(); } namespace @@ -301,22 +300,12 @@ Status validate_arguments_logits_softmax(const ITensorInfo &src, const ITensorIn } } // namespace -template -CpuLogits1DSoftmaxKernel::CpuLogits1DSoftmaxKernel() - : _beta(1.0f) -{ -} - template void CpuLogits1DSoftmaxKernel::configure(const ITensorInfo *src, const ITensorInfo *max, ITensorInfo *dst, const float beta, ITensorInfo *tmp) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, max, dst, 
tmp); - ARM_COMPUTE_ERROR_ON_NULLPTR(src, max, dst, tmp); - // Perform validation step ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_logits_softmax(*src, *max, *dst, beta, *tmp, IS_LOG)); - _beta = beta; - // Configure kernel window const bool is_quantized_asymmetric = is_data_type_quantized_asymmetric(src->data_type()); @@ -328,6 +317,15 @@ void CpuLogits1DSoftmaxKernel::configure(const ITensorInfo *src, const I const DataType tmp_data_type = is_quantized_asymmetric ? DataType::F32 : src->data_type(); auto_init_if_empty(*tmp, TensorInfo(*src).set_data_type(tmp_data_type).reset_padding()); + const auto *uk = get_implementation_logits(SoftmaxSelectorData{ src->data_type(), CPUInfo::get() }); + ARM_COMPUTE_ERROR_ON_NULLPTR(uk); + + std::string kernel_name = IS_LOG ? std::string("CpuLogits1DLogSoftmaxKernel") : std::string("CpuLogits1DSoftmaxKernel"); + + _beta = beta; + _run_method = uk->ukernel; + _name = kernel_name.append("/").append(uk->name); + // Configure kernel window Window win = calculate_max_window(*max, Steps()); @@ -350,6 +348,7 @@ void CpuLogits1DSoftmaxKernel::run_op(ITensorPack &tensors, const Window ARM_COMPUTE_UNUSED(info); ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); + ARM_COMPUTE_ERROR_ON(_run_method == nullptr); const auto src = tensors.get_const_tensor(TensorType::ACL_SRC_0); auto max = tensors.get_tensor(TensorType::ACL_SRC_1); @@ -362,22 +361,13 @@ void CpuLogits1DSoftmaxKernel::run_op(ITensorPack &tensors, const Window ARM_COMPUTE_ERROR_ON(tmp->info()->total_size() < (info.num_threads * tmp_size_for_thread)); void *tmp_for_thread = tmp->buffer() + (info.thread_id * tmp_size_for_thread); - - const auto *uk = get_implementation_logits(SoftmaxSelectorData{ src->info()->data_type(), CPUInfo::get() }); - uk->ukernel(src, max, tmp_for_thread, dst, _beta, IS_LOG, window); + _run_method(src, max, tmp_for_thread, dst, _beta, IS_LOG, window); } template const char 
*CpuLogits1DSoftmaxKernel::name() const { - if(IS_LOG) - { - return "CpuLogits1DSoftmaxKernel"; - } - else - { - return "CpuLogits1DLogSoftmaxKernel"; - } + return _name.c_str(); } template class CpuLogits1DSoftmaxKernel; diff --git a/src/core/cpu/kernels/CpuSoftmaxKernel.h b/src/core/cpu/kernels/CpuSoftmaxKernel.h index aa10467965..2912098c30 100644 --- a/src/core/cpu/kernels/CpuSoftmaxKernel.h +++ b/src/core/cpu/kernels/CpuSoftmaxKernel.h @@ -38,7 +38,7 @@ class CpuLogits1DMaxKernel : public ICpuKernel { public: /** Constructor */ - CpuLogits1DMaxKernel(); + CpuLogits1DMaxKernel() = default; ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuLogits1DMaxKernel); /** Set the input and output tensors. * @@ -46,10 +46,9 @@ public: * @param[out] dst Destination tensor info. Data types supported: same as @p input */ void configure(const ITensorInfo *src, ITensorInfo *dst); - /** Static function to check if given info will lead to a valid configuration of @ref CpuLogits1DMaxKernel + /** Static function to check if given info will lead to a valid configuration * - * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. - * @param[in] dst Destination tensor info. Data types supported: same as @p input + * Similar to CpuLogits1DMaxKernel::configure() * * @return a status */ @@ -58,6 +57,13 @@ public: // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; + +private: + using SoftmaxLogits1DMaxKernelPtr = std::add_pointer::type; + +private: + SoftmaxLogits1DMaxKernelPtr _run_method{ nullptr }; + std::string _name{}; }; /** Interface for softmax computation for QASYMM8 with pre-computed max. 
*/ @@ -66,7 +72,7 @@ class CpuLogits1DSoftmaxKernel : public ICpuKernel { public: /** Default constructor */ - CpuLogits1DSoftmaxKernel(); + CpuLogits1DSoftmaxKernel() = default; ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuLogits1DSoftmaxKernel); /** Set the input and output tensors. @@ -80,14 +86,9 @@ public: * @param tmp Auxiliary tensor info. Must be type F32 and same shape as the input. */ void configure(const ITensorInfo *src, const ITensorInfo *max, ITensorInfo *dst, const float beta, ITensorInfo *tmp); - /** Static function to check if given info will lead to a valid configuration of @ref CpuLogits1DSoftmaxKernel + /** Static function to check if given info will lead to a valid configuration * - * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. - * @param[in] max Max values tensor info. Same shape as input with dimension 0 set to 1. - * Data types supported: same as @p input. - * @param[in] dst Destination tensor info. Data types supported: same as @p input. - * @param[in] beta A scaling factor for the exponent. - * @param[in] tmp Tensor info of auxiliary. Must be type F32 and same shape as the input. 
+ * Similar to CpuLogits1DSoftmaxKernel::configure() * * @return a status */ @@ -99,7 +100,12 @@ public: const char *name() const override; private: - float _beta; + using SoftmaxLogits1DKernelPtr = std::add_pointer::type; + +private: + float _beta{ 1.0f }; + SoftmaxLogits1DKernelPtr _run_method{ nullptr }; + std::string _name{}; }; } // namespace kernels } // namespace cpu diff --git a/src/core/cpu/kernels/CpuSubKernel.cpp b/src/core/cpu/kernels/CpuSubKernel.cpp index d7057bbe2b..098a324377 100644 --- a/src/core/cpu/kernels/CpuSubKernel.cpp +++ b/src/core/cpu/kernels/CpuSubKernel.cpp @@ -59,59 +59,59 @@ struct SubKernel static const SubKernel available_kernels[] = { { - "sub_same_neon", + "neon_fp32_sub", [](const SubSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == DataType::F32)); }, REGISTER_FP32_NEON(arm_compute::cpu::sub_same_neon) }, #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) { - "sub_same_neon", + "neon_fp16_sub", [](const SubSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == DataType::F16)); }, REGISTER_FP16_NEON(arm_compute::cpu::sub_same_neon) }, #endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */ { - "sub_same_neon", + "neon_u8_sub", [](const SubSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == data.dt3) && (data.dt1 == DataType::U8)); }, REGISTER_INTEGER_NEON(arm_compute::cpu::sub_same_neon) }, { - "sub_same_neon", + "neon_s16_sub", [](const SubSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == data.dt3) && (data.dt1 == DataType::S16)); }, REGISTER_INTEGER_NEON(arm_compute::cpu::sub_same_neon) }, { - "sub_same_neon", + "neon_s32_sub", [](const SubSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == data.dt3) && (data.dt1 == DataType::S32)); }, REGISTER_INTEGER_NEON(arm_compute::cpu::sub_same_neon) }, { - "sub_u8_s16_s16_neon", + "neon_u8_s16_s16_sub", [](const SubSelectorData & data) { return ((data.dt1 == DataType::U8) && (data.dt2 == 
DataType::S16)); }, REGISTER_INTEGER_NEON(arm_compute::cpu::sub_u8_s16_s16_neon) }, { - "sub_s16_u8_s16_neon", + "neon_s16_u8_s16_sub", [](const SubSelectorData & data) { return ((data.dt1 == DataType::S16) && (data.dt2 == DataType::U8)); }, REGISTER_INTEGER_NEON(arm_compute::cpu::sub_s16_u8_s16_neon) }, { - "sub_u8_u8_s16_neon", + "neon_u8_u8_s16_sub", [](const SubSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt3 == DataType::S16)); }, REGISTER_INTEGER_NEON(arm_compute::cpu::sub_u8_u8_s16_neon) }, { - "sub_qasymm8_neon", + "neon_qu8_sub", [](const SubSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == DataType::QASYMM8)); }, REGISTER_QASYMM8_NEON(arm_compute::cpu::sub_qasymm8_neon) }, { - "sub_qasymm8_signed_neon", + "neon_qs8_sub", [](const SubSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == DataType::QASYMM8_SIGNED)); }, REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::sub_qasymm8_signed_neon) }, { - "sub_qsymm16_neon", + "neon_qs16_sub", [](const SubSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == DataType::QSYMM16)); }, REGISTER_QSYMM16_NEON(arm_compute::cpu::sub_qsymm16_neon) }, @@ -206,7 +206,12 @@ void CpuSubKernel::configure(const ITensorInfo *src0, const ITensorInfo *src1, I // Auto initialize dst if not initialized set_shape_if_empty(*dst, out_shape); - _policy = policy; + const auto *uk = get_implementation(src0->data_type(), src1->data_type(), dst->data_type()); + ARM_COMPUTE_ERROR_ON_NULLPTR(uk); + + _policy = policy; + _run_method = uk->ukernel; + _name = std::string("CpuSubKernel").append("/").append(uk->name); // CpuSubKernel doesn't need padding so update_window_and_padding() can be skipped Window win = calculate_max_window(out_shape, Steps()); @@ -227,19 +232,18 @@ void CpuSubKernel::run_op(ITensorPack &tensors, const Window &window, const Thre ARM_COMPUTE_UNUSED(info); ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); + ARM_COMPUTE_ERROR_ON(_run_method == nullptr); const ITensor *src0 = tensors.get_const_tensor(TensorType::ACL_SRC_0); const ITensor *src1 = tensors.get_const_tensor(TensorType::ACL_SRC_1); ITensor *dst = tensors.get_tensor(TensorType::ACL_DST); - // Dispatch kernel - const auto *uk = get_implementation(src0->info()->data_type(), src1->info()->data_type(), dst->info()->data_type()); - uk->ukernel(src0, src1, dst, _policy, window); + _run_method(src0, src1, dst, _policy, window); } const char *CpuSubKernel::name() const { - return "CpuSubKernel"; + return _name.c_str(); } } // namespace kernels } // namespace cpu diff --git a/src/core/cpu/kernels/CpuSubKernel.h b/src/core/cpu/kernels/CpuSubKernel.h index da114b6e08..b9160bd150 100644 --- a/src/core/cpu/kernels/CpuSubKernel.h +++ b/src/core/cpu/kernels/CpuSubKernel.h @@ -61,25 +61,9 @@ public: * @param[in] policy Overflow policy. Convert policy cannot be WRAP if datatype is quantized. */ void configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, ConvertPolicy policy); - /** Static function to check if given info will lead to a valid configuration of @ref CpuSubKernel + /** Static function to check if given info will lead to a valid configuration * - * Valid configurations (src0,src1) -> dst : - * - * - (U8,U8) -> U8 - * - (U8,U8) -> S16 - * - (QASYMM8, QASYMM8) -> QASYMM8 - * - (QASYMM8_SIGNED, QASYMM8_SIGNED) -> QASYMM8_SIGNED - * - (S16,U8) -> S16 - * - (U8,S16) -> S16 - * - (S16,S16) -> S16 - * - (S32,S32) -> S32 - * - (F16,F16) -> F16 - * - (F32,F32) -> F32 - * - * @param[in] src0 An input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/S32/F16/F32 - * @param[in] src1 An input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/S32/F16/F32 - * @param[in] dst The dst tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/S32/F16/F32. 
- * @param[in] policy Policy to use to handle overflow. Convert policy cannot be WRAP if datatype is quantized. + * Similar to CpuSubKernel::configure() * * @return a status */ @@ -89,8 +73,13 @@ public: void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; +private: + using SubKernelPtr = std::add_pointer::type; + private: ConvertPolicy _policy{}; + SubKernelPtr _run_method{ nullptr }; + std::string _name{}; }; } // namespace kernels } // namespace cpu -- cgit v1.2.1