From 5fdde99f4271891a40c02cd1e89f1344aa84583a Mon Sep 17 00:00:00 2001
From: Georgios Pinitas
Date: Fri, 25 Jun 2021 05:42:57 +0100
Subject: Improve selection speed of CPU implementations

CPU micro-kernel to be used was picked during kernel execution.
Move selection during configuration to reduce runtime overhead.

Standardize kernel names as follows:
<simd_tech>_<data_type>_<data_layout>_<kernel_name>
e.g. sve_fp32_nhwc_scale

Signed-off-by: Georgios Pinitas
Change-Id: I544f1c08c8fef0f130a3bde61882ccb9a1f47f21
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5855
Reviewed-by: Michele Di Giorgio
Tested-by: Arm Jenkins
---
 src/core/cpu/kernels/CpuSoftmaxKernel.cpp | 84 ++++++++++++++-----------------
 1 file changed, 37 insertions(+), 47 deletions(-)

diff --git a/src/core/cpu/kernels/CpuSoftmaxKernel.cpp b/src/core/cpu/kernels/CpuSoftmaxKernel.cpp
index 1e00e12050..c562699092 100644
--- a/src/core/cpu/kernels/CpuSoftmaxKernel.cpp
+++ b/src/core/cpu/kernels/CpuSoftmaxKernel.cpp
@@ -72,12 +72,12 @@ static const SoftmaxLogits1DKernel available_logits_1d_kernels[] =
 {
 #if defined(ARM_COMPUTE_ENABLE_SVE)
     {
-        "sve_softmax_logits_1d_float",
+        "sve_fp32_softmax_logits_1d",
         [](const SoftmaxSelectorData & data) { return (data.dt == DataType::F32) && data.ci.has_sve(); },
         REGISTER_FP32_SVE(arm_compute::cpu::sve_softmax_logits_1d_float)
     },
     {
-        "sve_softmax_logits_1d_float",
+        "sve_fp16_softmax_logits_1d",
         [](const SoftmaxSelectorData & data) { return (data.dt == DataType::F16) && data.ci.has_sve(); },
         REGISTER_FP16_SVE(arm_compute::cpu::sve_softmax_logits_1d_float)
     },
@@ -85,13 +85,13 @@ static const SoftmaxLogits1DKernel available_logits_1d_kernels[] =
 #if defined(ARM_COMPUTE_ENABLE_NEON)
     {
-        "neon_softmax_logits_1d_float",
+        "neon_fp32_softmax_logits_1d",
         [](const SoftmaxSelectorData & data) { return (data.dt == DataType::F32); },
         REGISTER_FP32_NEON(arm_compute::cpu::neon_softmax_logits_1d_float)
     },
 #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
     {
-        "neon_softmax_logits_1d_float",
+        "neon_fp16_softmax_logits_1d",
         [](const SoftmaxSelectorData & data) { return (data.dt == DataType::F16); },
         REGISTER_FP16_NEON(arm_compute::cpu::neon_softmax_logits_1d_float)
     },
@@ -100,23 +100,23 @@ static const SoftmaxLogits1DKernel available_logits_1d_kernels[] =
 #if defined(ARM_COMPUTE_ENABLE_SVE2)
     {
-        "sve_softmax_logits_1d_quantized",
+        "sve2_qu8_softmax_logits_1d",
         [](const SoftmaxSelectorData & data) { return (data.dt == DataType::QASYMM8) && data.ci.has_sve2(); },
         REGISTER_QASYMM8_SVE(arm_compute::cpu::sve_softmax_logits_1d_quantized)
     },
     {
-        "sve_softmax_logits_1d_quantized",
+        "sve2_qs8_softmax_logits_1d",
         [](const SoftmaxSelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED) && data.ci.has_sve2(); },
         REGISTER_QASYMM8_SIGNED_SVE(arm_compute::cpu::sve_softmax_logits_1d_quantized)
     },
 #endif /* defined(ARM_COMPUTE_ENABLE_SVE2) */
     {
-        "neon_softmax_logits_1d_quantized",
+        "neon_qu8_softmax_logits_1d",
         [](const SoftmaxSelectorData & data) { return (data.dt == DataType::QASYMM8); },
         REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_softmax_logits_1d_quantized)
     },
     {
-        "neon_softmax_logits_1d_quantized",
+        "neon_qs8_softmax_logits_1d",
         [](const SoftmaxSelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED); },
         REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_softmax_logits_1d_quantized)
     },
@@ -126,46 +126,46 @@ static const SoftmaxLogits1DMaxKernel available_logits_1d_max_kernels[] =
 {
 #if defined(ARM_COMPUTE_ENABLE_SVE)
     {
-        "sve_logits_1d_max",
+        "sve_fp32_logits_1d_max",
         [](const SoftmaxSelectorData & data) { return (data.dt == DataType::F32) && data.ci.has_sve(); },
         REGISTER_FP32_SVE(arm_compute::cpu::sve_logits_1d_max)
     },
     {
-        "sve_logits_1d_max",
+        "sve_fp16_logits_1d_max",
         [](const SoftmaxSelectorData & data) { return (data.dt == DataType::F16) && data.ci.has_sve(); },
         REGISTER_FP16_SVE(arm_compute::cpu::sve_logits_1d_max)
     },
     {
-        "sve_logits_1d_max",
+        "sve_qu8_logits_1d_max",
         [](const SoftmaxSelectorData & data) { return (data.dt == DataType::QASYMM8) && data.ci.has_sve(); },
         REGISTER_QASYMM8_SVE(arm_compute::cpu::sve_logits_1d_max)
     },
     {
-        "sve_logits_1d_max",
+        "sve_qs8_logits_1d_max",
         [](const SoftmaxSelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED) && data.ci.has_sve(); },
         REGISTER_QASYMM8_SIGNED_SVE(arm_compute::cpu::sve_logits_1d_max)
     },
 #endif /* defined(ARM_COMPUTE_ENABLE_SVE) */
 #if defined(ARM_COMPUTE_ENABLE_NEON)
     {
-        "neon_logits_1d_max",
+        "neon_fp32_logits_1d_max",
         [](const SoftmaxSelectorData & data) { return (data.dt == DataType::F32); },
         REGISTER_FP32_NEON(arm_compute::cpu::neon_logits_1d_max)
     },
 #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
     {
-        "neon_logits_1d_max",
+        "neon_fp16_logits_1d_max",
         [](const SoftmaxSelectorData & data) { return (data.dt == DataType::F16); },
         REGISTER_FP16_NEON(arm_compute::cpu::neon_logits_1d_max)
     },
 #endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */
     {
-        "neon_logits_1d_max",
+        "neon_qu8_logits_1d_max",
         [](const SoftmaxSelectorData & data) { return (data.dt == DataType::QASYMM8); },
         REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_logits_1d_max)
     },
     {
-        "neon_logits_1d_max",
+        "neon_qs8_logits_1d_max",
         [](const SoftmaxSelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED); },
         REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_logits_1d_max)
     },
@@ -214,15 +214,9 @@ Status validate_arguments_logits_1d_max(const ITensorInfo &input, const ITensorI
 } // namespace
 
-CpuLogits1DMaxKernel::CpuLogits1DMaxKernel()
-{
-}
-
 void CpuLogits1DMaxKernel::configure(const ITensorInfo *src, ITensorInfo *dst)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
-
-    // Perform validation step
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_logits_1d_max(*src, *dst));
 
     // Softmax across the x dimension
@@ -230,8 +224,13 @@ void CpuLogits1DMaxKernel::configure(const ITensorInfo *src, ITensorInfo *dst)
     // Output auto initialization if not yet initialized
     auto_init_if_empty(*dst, output_shape, 1, src->data_type(), src->quantization_info());
 
-    Window win = calculate_max_window(*src, Steps());
+    const auto *uk = get_implementation_logits_max(SoftmaxSelectorData{ src->data_type(), CPUInfo::get() });
+    ARM_COMPUTE_ERROR_ON_NULLPTR(uk);
+
+    _run_method = uk->ukernel;
+    _name       = std::string("CpuLogits1DMaxKernel").append("/").append(uk->name);
 
+    Window win = calculate_max_window(*src, Steps());
     ICpuKernel::configure(win);
 }
@@ -248,17 +247,17 @@ void CpuLogits1DMaxKernel::run_op(ITensorPack &tensors, const Window &window, co
     ARM_COMPUTE_UNUSED(info);
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
+    ARM_COMPUTE_ERROR_ON(_run_method == nullptr);
 
     const auto src = tensors.get_const_tensor(TensorType::ACL_SRC);
     auto       dst = tensors.get_tensor(TensorType::ACL_DST);
 
-    const auto *uk = get_implementation_logits_max(SoftmaxSelectorData{ src->info()->data_type(), CPUInfo::get() });
-    uk->ukernel(src, dst, window);
+    _run_method(src, dst, window);
 }
 
 const char *CpuLogits1DMaxKernel::name() const
 {
-    return "CpuLogits1DMaxKernel";
+    return _name.c_str();
 }
 
 namespace
@@ -301,22 +300,12 @@ Status validate_arguments_logits_softmax(const ITensorInfo &src, const ITensorIn
     }
 }
 } // namespace
 
-template <bool IS_LOG>
-CpuLogits1DSoftmaxKernel<IS_LOG>::CpuLogits1DSoftmaxKernel()
-    : _beta(1.0f)
-{
-}
-
 template <bool IS_LOG>
 void CpuLogits1DSoftmaxKernel<IS_LOG>::configure(const ITensorInfo *src, const ITensorInfo *max, ITensorInfo *dst, const float beta, ITensorInfo *tmp)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src, max, dst, tmp);
-    ARM_COMPUTE_ERROR_ON_NULLPTR(src, max, dst, tmp);
-    // Perform validation step
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_logits_softmax(*src, *max, *dst, beta, *tmp, IS_LOG));
 
-    _beta = beta;
-
     // Configure kernel window
     const bool is_quantized_asymmetric = is_data_type_quantized_asymmetric(src->data_type());
@@ -328,6 +317,15 @@ void CpuLogits1DSoftmaxKernel<IS_LOG>::configure(const ITensorInfo *src, const I
     const DataType tmp_data_type = is_quantized_asymmetric ? DataType::F32 : src->data_type();
     auto_init_if_empty(*tmp, TensorInfo(*src).set_data_type(tmp_data_type).reset_padding());
 
+    const auto *uk = get_implementation_logits(SoftmaxSelectorData{ src->data_type(), CPUInfo::get() });
+    ARM_COMPUTE_ERROR_ON_NULLPTR(uk);
+
+    std::string kernel_name = IS_LOG ? std::string("CpuLogits1DLogSoftmaxKernel") : std::string("CpuLogits1DSoftmaxKernel");
+
+    _beta       = beta;
+    _run_method = uk->ukernel;
+    _name       = kernel_name.append("/").append(uk->name);
+
     // Configure kernel window
     Window win = calculate_max_window(*max, Steps());
@@ -350,6 +348,7 @@ void CpuLogits1DSoftmaxKernel<IS_LOG>::run_op(ITensorPack &tensors, const Window
     ARM_COMPUTE_UNUSED(info);
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
+    ARM_COMPUTE_ERROR_ON(_run_method == nullptr);
 
     const auto src = tensors.get_const_tensor(TensorType::ACL_SRC_0);
     auto       max = tensors.get_tensor(TensorType::ACL_SRC_1);
@@ -362,22 +361,13 @@ void CpuLogits1DSoftmaxKernel<IS_LOG>::run_op(ITensorPack &tensors, const Window
     ARM_COMPUTE_ERROR_ON(tmp->info()->total_size() < (info.num_threads * tmp_size_for_thread));
 
     void *tmp_for_thread = tmp->buffer() + (info.thread_id * tmp_size_for_thread);
-
-    const auto *uk = get_implementation_logits(SoftmaxSelectorData{ src->info()->data_type(), CPUInfo::get() });
-    uk->ukernel(src, max, tmp_for_thread, dst, _beta, IS_LOG, window);
+    _run_method(src, max, tmp_for_thread, dst, _beta, IS_LOG, window);
 }
 
 template <bool IS_LOG>
 const char *CpuLogits1DSoftmaxKernel<IS_LOG>::name() const
 {
-    if(IS_LOG)
-    {
-        return "CpuLogits1DSoftmaxKernel";
-    }
-    else
-    {
-        return "CpuLogits1DLogSoftmaxKernel";
-    }
+    return _name.c_str();
 }
 
 template class CpuLogits1DSoftmaxKernel<true>;
 template class CpuLogits1DSoftmaxKernel<false>;
-- 
cgit v1.2.1
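
The pattern the patch applies, in miniature: a static table pairs each standardized kernel name with a selection predicate and a function pointer; configure() resolves the table once and caches the result in _run_method/_name, so the per-execution hot path is just an indirect call. The sketch below illustrates that pattern only under assumed names -- MicroKernel, SelectorData, MiniKernel and the dummy kernels are hypothetical stand-ins, not ComputeLibrary API.

    #include <cassert>
    #include <cstdio>
    #include <string>

    enum class DataType { F32, F16 };

    struct SelectorData
    {
        DataType dt;
        bool     has_sve;
    };

    using KernelFn = void (*)(float beta);

    // Each entry pairs a standardized name (<simd_tech>_<data_type>_<kernel_name>)
    // with a selection predicate and the micro-kernel to run.
    struct MicroKernel
    {
        const char *name;
        bool (*is_selected)(const SelectorData &);
        KernelFn ukernel;
    };

    void sve_fp32_dummy(float beta)  { std::printf("sve  kernel, beta=%f\n", beta); }
    void neon_fp32_dummy(float beta) { std::printf("neon kernel, beta=%f\n", beta); }

    // First match wins, so entries are ordered from most to least specialized.
    static const MicroKernel available_kernels[] =
    {
        { "sve_fp32_dummy", [](const SelectorData &d) { return d.dt == DataType::F32 && d.has_sve; }, &sve_fp32_dummy },
        { "neon_fp32_dummy", [](const SelectorData &d) { return d.dt == DataType::F32; }, &neon_fp32_dummy },
    };

    const MicroKernel *get_implementation(const SelectorData &data)
    {
        for(const auto &uk : available_kernels)
        {
            if(uk.is_selected(data))
            {
                return &uk;
            }
        }
        return nullptr;
    }

    class MiniKernel
    {
    public:
        // Selection happens once here, not on every run() call.
        void configure(const SelectorData &data, float beta)
        {
            const auto *uk = get_implementation(data);
            assert(uk != nullptr);
            _beta       = beta;
            _run_method = uk->ukernel;
            _name       = std::string("MiniKernel/").append(uk->name);
        }

        // The hot path just invokes the cached function pointer.
        void run() const
        {
            assert(_run_method != nullptr);
            _run_method(_beta);
        }

        const char *name() const { return _name.c_str(); }

    private:
        KernelFn    _run_method{ nullptr };
        float       _beta{ 1.0f };
        std::string _name{};
    };

    int main()
    {
        MiniKernel kernel;
        kernel.configure(SelectorData{ DataType::F32, /* has_sve */ false }, 1.0f);
        std::printf("%s\n", kernel.name()); // prints "MiniKernel/neon_fp32_dummy"
        kernel.run();                       // dispatches without re-scanning the table
    }

The trade-off is one cached pointer and name string per kernel instance in exchange for removing the selector scan from run_op(); caching the composed name also lets name() report the specific micro-kernel (e.g. "CpuLogits1DMaxKernel/neon_fp32_logits_1d_max") instead of a fixed class name.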