Diffstat (limited to 'src/cpu/kernels')
18 files changed, 380 insertions, 788 deletions
diff --git a/src/cpu/kernels/CpuKernelSelectionTypes.h b/src/cpu/kernels/CpuKernelSelectionTypes.h
index b7daa4d583..45ebeec394 100644
--- a/src/cpu/kernels/CpuKernelSelectionTypes.h
+++ b/src/cpu/kernels/CpuKernelSelectionTypes.h
@@ -21,8 +21,8 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ARM_COMPUTE_CPU_KERNEL_SELECTION_TYPES_H
-#define ARM_COMPUTE_CPU_KERNEL_SELECTION_TYPES_H
+#ifndef ACL_SRC_CPU_KERNELS_CPUKERNELSELECTIONTYPES_H
+#define ACL_SRC_CPU_KERNELS_CPUKERNELSELECTIONTYPES_H
 
 #include "arm_compute/core/Types.h"
 
@@ -99,6 +99,13 @@ struct ScaleKernelDataTypeISASelectorData
     InterpolationPolicy interpolation_policy;
 };
 
+struct SoftmaxKernelDataTypeISASelectorData
+{
+    DataType            dt;
+    cpuinfo::CpuIsaInfo isa;
+    bool                is_log;
+};
+
 // Selector pointer types
 using DataTypeISASelectorPtr        = std::add_pointer<bool(const DataTypeISASelectorData &data)>::type;
 using DataTypeDataLayoutSelectorPtr = std::add_pointer<bool(const DataTypeDataLayoutISASelectorData &data)>::type;
@@ -113,9 +120,10 @@ using CpuAddKernelDataTypeISASelectorDataPtr =
     std::add_pointer<bool(const CpuAddKernelDataTypeISASelectorData &data)>::type;
 using ScaleKernelDataTypeISASelectorDataPtr =
     std::add_pointer<bool(const ScaleKernelDataTypeISASelectorData &data)>::type;
-
+using SoftmaxKernelDataTypeISASelectorDataPtr =
+    std::add_pointer<bool(const SoftmaxKernelDataTypeISASelectorData &data)>::type;
 } // namespace kernels
 } // namespace cpu
 } // namespace arm_compute
-#endif // ARM_COMPUTE_CPU_KERNEL_SELECTION_TYPES_H
+#endif // ACL_SRC_CPU_KERNELS_CPUKERNELSELECTIONTYPES_H
diff --git a/src/cpu/kernels/CpuSoftmaxKernel.cpp b/src/cpu/kernels/CpuSoftmaxKernel.cpp
index ce144351f8..486f55e2c1 100644
--- a/src/cpu/kernels/CpuSoftmaxKernel.cpp
+++ b/src/cpu/kernels/CpuSoftmaxKernel.cpp
@@ -34,9 +34,12 @@
 #include "src/core/common/Registrars.h"
 #include "src/core/CPP/Validate.h"
 #include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/Utils.h"
 #include "src/core/helpers/WindowHelpers.h"
 
 #include "src/cpu/kernels/softmax/list.h"
 
+#include <vector>
+
 namespace arm_compute
 {
 namespace cpu
@@ -45,136 +48,40 @@ namespace kernels
 {
 namespace
 {
-/* Softmax Logits 1D Max - identifying the max value of 1D Logits */
-static const std::vector<CpuLogits1DMaxKernel::SoftmaxLogits1DMaxKernel> available_kernels_max_logits = {
-    {"sve_fp32_logits_1d_max",
-     [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F32) && data.isa.sve; },
-     REGISTER_FP32_SVE(sve_fp32_logits)},
-    {"sve_fp16_logits_1d_max",
-     [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F16) && data.isa.sve && data.isa.fp16; },
-     REGISTER_FP16_SVE(sve_fp16_logits)},
-    {"sve_qu8_logits_1d_max",
-     [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8) && data.isa.sve; },
-     REGISTER_QASYMM8_SVE(sve_qasymm8_logits)},
-    {"sve_qs8_logits_1d_max",
-     [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8_SIGNED) && data.isa.sve; },
-     REGISTER_QASYMM8_SIGNED_SVE(sve_qasymm8_signed_logits)},
-    {"neon_fp32_logits_1d_max", [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F32); },
-     REGISTER_FP32_NEON(neon_fp32_logits)},
-    {"neon_fp16_logits_1d_max",
-     [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F16) && data.isa.fp16; },
-     REGISTER_FP16_NEON(neon_fp16_logits)},
-    {"neon_qu8_logits_1d_max", [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8); },
-     REGISTER_QASYMM8_NEON(neon_qasymm8_logits)},
-    {"neon_qs8_logits_1d_max",
-     [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8_SIGNED); },
-     REGISTER_QASYMM8_SIGNED_NEON(neon_qasymm8_singed_logits)},
+/* Softmax */
+static const std::vector<typename CpuSoftmaxKernel::SoftmaxKernel> available_kernels = {
+    {"neon_fp32_softmax",
+     [](const SoftmaxKernelDataTypeISASelectorData &data) { return (!data.is_log && data.dt == DataType::F32); },
+     REGISTER_FP32_NEON(neon_fp32_softmax<false>)},
+    {"neon_fp16_softmax",
+     [](const SoftmaxKernelDataTypeISASelectorData &data)
+     { return (!data.is_log && data.dt == DataType::F16) && data.isa.fp16; },
+     REGISTER_FP16_NEON(neon_fp16_softmax<false>)},
+    {"neon_qu8_softmax",
+     [](const SoftmaxKernelDataTypeISASelectorData &data) { return (!data.is_log && data.dt == DataType::QASYMM8); },
+     REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_qasymm8_softmax<false>)},
+    {"neon_qs8_softmax",
+     [](const SoftmaxKernelDataTypeISASelectorData &data)
+     { return (!data.is_log && data.dt == DataType::QASYMM8_SIGNED); },
+     REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_qasymm8_signed_softmax<false>)},
+    {"neon_fp32_log_softmax",
+     [](const SoftmaxKernelDataTypeISASelectorData &data) { return (data.is_log && data.dt == DataType::F32); },
+     REGISTER_FP32_NEON(neon_fp32_softmax<true>)},
+    {"neon_fp16_log_softmax",
+     [](const SoftmaxKernelDataTypeISASelectorData &data)
+     { return (data.is_log && data.dt == DataType::F16) && data.isa.fp16; },
+     REGISTER_FP16_NEON(neon_fp16_softmax<true>)},
+    {"neon_qu8_log_softmax",
+     [](const SoftmaxKernelDataTypeISASelectorData &data) { return (data.is_log && data.dt == DataType::QASYMM8); },
+     REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_qasymm8_softmax<true>)},
+    {"neon_qs8_log_softmax",
+     [](const SoftmaxKernelDataTypeISASelectorData &data)
+     { return (data.is_log && data.dt == DataType::QASYMM8_SIGNED); },
+     REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_qasymm8_signed_softmax<true>)},
 };
 
-Status validate_arguments_logits_1d_max(const ITensorInfo &input, const ITensorInfo &output)
-{
-    ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&input);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
-                                                         DataType::F16, DataType::F32);
-
-    // Validate in case of configured output
-    if (output.total_size() != 0)
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input, &output);
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&input, &output);
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output.tensor_shape(),
-                                                           TensorShape(input.tensor_shape()).set(0, 1));
-    }
-
-    return Status{};
-}
-} //namespace
-const std::vector<CpuLogits1DMaxKernel::SoftmaxLogits1DMaxKernel> &CpuLogits1DMaxKernel::get_available_kernels()
-{
-    return available_kernels_max_logits;
-}
-
-void CpuLogits1DMaxKernel::configure(const ITensorInfo *src, ITensorInfo *dst)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_logits_1d_max(*src, *dst));
-
-    // Softmax across the x dimension
-    const TensorShape output_shape = TensorShape(src->tensor_shape()).set(0, 1);
-    // Output auto initialization if not yet initialized
-    auto_init_if_empty(*dst, output_shape, 1, src->data_type(), src->quantization_info());
-
-    const auto *uk = get_implementation(DataTypeISASelectorData{src->data_type(), CPUInfo::get().get_isa()});
-    ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
-
-    _run_method = uk->ukernel;
-    _name       = std::string("CpuLogits1DMaxKernel").append("/").append(uk->name);
-
-    Window win = calculate_max_window(*src, Steps());
-    ICpuKernel::configure(win);
-}
-
-Status CpuLogits1DMaxKernel::validate(const ITensorInfo *src, const ITensorInfo *dst)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_logits_1d_max(*src, *dst));
-
-    return Status{};
-}
-
-void CpuLogits1DMaxKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
-{
-    ARM_COMPUTE_UNUSED(info);
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
-    ARM_COMPUTE_ERROR_ON(_run_method == nullptr);
-
-    const auto src = tensors.get_const_tensor(TensorType::ACL_SRC);
-    auto       dst = tensors.get_tensor(TensorType::ACL_DST);
-
-    _run_method(src, dst, window);
-}
-
-const char *CpuLogits1DMaxKernel::name() const
-{
-    return _name.c_str();
-}
-
-/* Softmax Logits 1D - computation for QASYMM8 with pre-computed max. */
-template <bool IS_LOG>
-static const std::vector<typename CpuLogits1DSoftmaxKernel<IS_LOG>::SoftmaxLogits1DKernel> available_kernels_logits = {
-    {"sve2_qu8_softmax_logits_1d",
-     [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8) && data.isa.sve2; },
-     REGISTER_QASYMM8_SVE2(sve2_qasymm8_softmax)},
-    {"sve2_qs8_softmax_logits_1d",
-     [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8_SIGNED) && data.isa.sve2; },
-     REGISTER_QASYMM8_SIGNED_SVE2(sve2_qasymm8_signed_softmax)},
-    {"sve_fp32_softmax_logits_1d",
-     [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F32) && data.isa.sve; },
-     REGISTER_FP32_SVE(sve_fp32_softmax)},
-    {"sve_fp16_softmax_logits_1d",
-     [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F16) && data.isa.sve && data.isa.fp16; },
-     REGISTER_FP16_SVE(sve_fp16_softmax)},
-
-    {"neon_fp32_softmax_logits_1d", [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F32); },
-     REGISTER_FP32_NEON(neon_fp32_softmax)},
-    {"neon_fp16_softmax_logits_1d",
-     [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F16) && data.isa.fp16; },
-     REGISTER_FP16_NEON(neon_fp16_softmax)},
-    {"neon_qu8_softmax_logits_1d", [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8); },
-     REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_qasymm8_softmax)},
-    {"neon_qs8_softmax_logits_1d",
-     [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8_SIGNED); },
-     REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_qasymm8_signed_softmax)},
-};
-namespace
-{
-Status validate_arguments_logits_softmax(const ITensorInfo &src,
-                                         const ITensorInfo &max,
-                                         const ITensorInfo &dst,
-                                         const float        beta,
-                                         const ITensorInfo &tmp,
-                                         bool               is_log)
+Status validate_arguments_softmax(
+    const ITensorInfo &src, const ITensorInfo &dst, float beta, const ITensorInfo &tmp, bool is_log)
 {
     ARM_COMPUTE_UNUSED(beta);
     // Check input
@@ -184,11 +91,6 @@ Status validate_arguments_logits_softmax(const ITensorInfo &src,
 
     const bool is_quantized_asymmetric = is_data_type_quantized_asymmetric(src.data_type());
 
-    // Check max
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src, &max);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(TensorShape(src.tensor_shape()).set(0, 1), max.tensor_shape());
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&src, &max);
-
     // Check output if configured
     if (dst.total_size() != 0)
     {
@@ -203,8 +105,11 @@ Status validate_arguments_logits_softmax(const ITensorInfo &src,
     // Check tmp if configured
     if (tmp.total_size() != 0)
     {
-        const DataType tmp_data_type = is_quantized_asymmetric ? DataType::F32 : src.data_type();
-        ARM_COMPUTE_RETURN_ERROR_ON(tmp.data_type() != tmp_data_type);
+        // We have temporary storage only if the src data type is quantized.
+        // Therefore, the tmp data type must be F32.
+        ARM_COMPUTE_RETURN_ERROR_ON(tmp.data_type() != DataType::F32);
+        ARM_COMPUTE_RETURN_ERROR_ON(!is_quantized_asymmetric);
+
         // We could potentially reduce tmp memory if we could predict or make an assumption
         // on the maximum number of threads that will run in parallel.
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&src, &tmp);
@@ -214,91 +119,97 @@ Status validate_arguments_logits_softmax(const ITensorInfo &src,
     }
 }
 } // namespace
 
-template <bool IS_LOG>
-const std::vector<typename CpuLogits1DSoftmaxKernel<IS_LOG>::SoftmaxLogits1DKernel> &
-CpuLogits1DSoftmaxKernel<IS_LOG>::get_available_kernels()
+const std::vector<typename CpuSoftmaxKernel::SoftmaxKernel> &CpuSoftmaxKernel::get_available_kernels()
 {
-    return available_kernels_logits<IS_LOG>;
+    return available_kernels;
 }
 
-template <bool IS_LOG>
-void CpuLogits1DSoftmaxKernel<IS_LOG>::configure(
-    const ITensorInfo *src, const ITensorInfo *max, ITensorInfo *dst, const float beta, ITensorInfo *tmp)
+void CpuSoftmaxKernel::configure(const ITensorInfo *src, ITensorInfo *dst, float beta, bool is_log, ITensorInfo *tmp)
 {
-    ARM_COMPUTE_ERROR_ON_NULLPTR(src, max, dst, tmp);
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_logits_softmax(*src, *max, *dst, beta, *tmp, IS_LOG));
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst, tmp);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_softmax(*src, *dst, beta, *tmp, is_log));
 
     // Configure kernel window
     const bool is_quantized_asymmetric = is_data_type_quantized_asymmetric(src->data_type());
 
     // Output auto initialization if not yet initialized
     const QuantizationInfo output_quantization =
-        is_quantized_asymmetric ? arm_compute::get_softmax_output_quantization_info(src->data_type(), IS_LOG)
+        is_quantized_asymmetric ? arm_compute::get_softmax_output_quantization_info(src->data_type(), is_log)
                                 : dst->quantization_info();
     auto_init_if_empty(*dst, TensorInfo(*src).set_quantization_info(output_quantization).reset_padding());
 
-    // Tmp auto initialization if not yet initialized
-    const DataType tmp_data_type = is_quantized_asymmetric ? DataType::F32 : src->data_type();
-    auto_init_if_empty(*tmp, TensorInfo(*src).set_data_type(tmp_data_type).reset_padding());
+    // Tmp auto initialization if not yet initialized and src is quantized
+    if (is_quantized_asymmetric)
+    {
+        const DataType tmp_data_type = is_quantized_asymmetric ? DataType::F32 : src->data_type();
+        auto_init_if_empty(*tmp, TensorInfo(*src).set_data_type(tmp_data_type).reset_padding());
+    }
 
-    const auto *uk = CpuLogits1DSoftmaxKernel<IS_LOG>::get_implementation(
-        DataTypeISASelectorData{src->data_type(), CPUInfo::get().get_isa()});
+    const auto *uk = CpuSoftmaxKernel::get_implementation(
+        SoftmaxKernelDataTypeISASelectorData{src->data_type(), CPUInfo::get().get_isa(), is_log});
     ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
 
-    std::string kernel_name =
-        IS_LOG ? std::string("CpuLogits1DLogSoftmaxKernel") : std::string("CpuLogits1DSoftmaxKernel");
+    std::string kernel_name = is_log ? std::string("CpuLogSoftmaxKernel") : std::string("CpuSoftmaxKernel");
 
     _beta       = beta;
     _run_method = uk->ukernel;
     _name       = kernel_name.append("/").append(uk->name);
 
-    // Configure kernel window
-    Window win = calculate_max_window(*max, Steps());
+    Window win = calculate_max_window(*dst, Steps());
+
+    /// TODO: Check dimensions > 0 for holes only. For this, we need
+    /// a utility function checking if there are holes after some dimension.
+    if (!has_holes(*dst, dst->num_dimensions() - 1))
+    {
+        win = win.collapse(win, Window::DimY);
+    }
 
-    ICpuKernel<CpuLogits1DSoftmaxKernel<IS_LOG>>::configure(win);
+    win.set(Window::DimX, Window::Dimension(0, 1, 1)); // First dimension is the reduction axis
+
+    ICpuKernel<CpuSoftmaxKernel>::configure(win);
 }
 
-template <bool IS_LOG>
-Status CpuLogits1DSoftmaxKernel<IS_LOG>::validate(
-    const ITensorInfo *src, const ITensorInfo *max, const ITensorInfo *dst, const float beta, const ITensorInfo *tmp)
+Status CpuSoftmaxKernel::validate(
    const ITensorInfo *src, const ITensorInfo *dst, float beta, bool is_log, const ITensorInfo *tmp)
 {
-    ARM_COMPUTE_ERROR_ON_NULLPTR(src, max, dst, tmp);
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_logits_softmax(*src, *max, *dst, beta, *tmp, IS_LOG));
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst, tmp);
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_softmax(*src, *dst, beta, *tmp, is_log));
 
     return Status{};
 }
 
-template <bool IS_LOG>
-void CpuLogits1DSoftmaxKernel<IS_LOG>::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
+void CpuSoftmaxKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
 {
-    ARM_COMPUTE_UNUSED(info);
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel<CpuLogits1DSoftmaxKernel<IS_LOG>>::window(), window);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel<CpuSoftmaxKernel>::window(), window);
     ARM_COMPUTE_ERROR_ON(_run_method == nullptr);
 
     const auto src = tensors.get_const_tensor(TensorType::ACL_SRC_0);
-    auto       max = tensors.get_tensor(TensorType::ACL_SRC_1);
     auto       dst = tensors.get_tensor(TensorType::ACL_DST_0);
-    auto       tmp = tensors.get_tensor(TensorType::ACL_DST_1);
 
-    const unsigned int num_elems_processed_per_iteration = src->info()->valid_region().shape.x();
-    const unsigned int tmp_size_for_thread = tmp->info()->element_size() * num_elems_processed_per_iteration;
+    if (is_data_type_quantized_asymmetric(src->info()->data_type()))
+    {
+        auto tmp = tensors.get_tensor(TensorType::ACL_DST_1);
+
+        const unsigned int num_elems_processed_per_iteration = src->info()->valid_region().shape.x();
+        const unsigned int tmp_size_for_thread = tmp->info()->element_size() * num_elems_processed_per_iteration;
 
-    ARM_COMPUTE_ERROR_ON(tmp->info()->total_size() < (info.num_threads * tmp_size_for_thread));
+        ARM_COMPUTE_ERROR_ON(tmp->info()->total_size() < (info.num_threads * tmp_size_for_thread));
 
-    void *tmp_for_thread = tmp->buffer() + (info.thread_id * tmp_size_for_thread);
-    _run_method(src, max, tmp_for_thread, dst, _beta, IS_LOG, window);
+        void *tmp_for_thread = tmp->buffer() + (info.thread_id * tmp_size_for_thread);
+        _run_method(src, tmp_for_thread, dst, _beta, window);
+    }
+    else
+    {
+        _run_method(src, nullptr, dst, _beta, window);
+    }
 }
 
-template <bool IS_LOG>
-const char *CpuLogits1DSoftmaxKernel<IS_LOG>::name() const
+const char *CpuSoftmaxKernel::name() const
 {
     return _name.c_str();
 }
 
-template class CpuLogits1DSoftmaxKernel<true>;
-template class CpuLogits1DSoftmaxKernel<false>;
-
 } // namespace kernels
 } // namespace cpu
 } // namespace arm_compute
diff --git a/src/cpu/kernels/CpuSoftmaxKernel.h b/src/cpu/kernels/CpuSoftmaxKernel.h
index 5d288179fd..3db1f3d0ef 100644
--- a/src/cpu/kernels/CpuSoftmaxKernel.h
+++ b/src/cpu/kernels/CpuSoftmaxKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2022 Arm Limited.
+ * Copyright (c) 2017-2023 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -21,8 +21,8 @@
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
-#ifndef ARM_COMPUTE_CPU_SOFTMAX_KERNEL_H
-#define ARM_COMPUTE_CPU_SOFTMAX_KERNEL_H
+#ifndef ACL_SRC_CPU_KERNELS_CPUSOFTMAXKERNEL_H
+#define ACL_SRC_CPU_KERNELS_CPUSOFTMAXKERNEL_H
 
 #include "src/core/common/Macros.h"
 #include "src/cpu/ICpuKernel.h"
@@ -33,102 +33,55 @@ namespace cpu
 {
 namespace kernels
 {
-/** Interface for the identifying the max value of 1D Logits */
-class CpuLogits1DMaxKernel : public ICpuKernel<CpuLogits1DMaxKernel>
+/** Interface for softmax computation */
+class CpuSoftmaxKernel : public ICpuKernel<CpuSoftmaxKernel>
 {
 private:
-    using SoftmaxLogits1DMaxKernelPtr = std::add_pointer<void(const ITensor *, ITensor *, const Window &)>::type;
+    using SoftmaxKernelPtr =
+        std::add_pointer<void(const ITensor *, void *const, ITensor *, float, const Window &)>::type;
 
 public:
-    CpuLogits1DMaxKernel() = default;
-    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuLogits1DMaxKernel);
-    /** Set the input and output tensors.
-     *
-     * @param[in]  src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
-     * @param[out] dst Destination tensor info. Data types supported: same as @p input
-     */
-    void configure(const ITensorInfo *src, ITensorInfo *dst);
-    /** Static function to check if given info will lead to a valid configuration
-     *
-     * Similar to CpuLogits1DMaxKernel::configure()
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
-
-    // Inherited methods overridden:
-    void        run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
-    const char *name() const override;
-
-    struct SoftmaxLogits1DMaxKernel
-    {
-        const char                  *name;
-        const DataTypeISASelectorPtr is_selected;
-        SoftmaxLogits1DMaxKernelPtr  ukernel;
-    };
-
-    static const std::vector<SoftmaxLogits1DMaxKernel> &get_available_kernels();
-
-private:
-    SoftmaxLogits1DMaxKernelPtr _run_method{nullptr};
-    std::string                 _name{};
-};
-
-/** Interface for softmax computation for QASYMM8 with pre-computed max. */
-template <bool IS_LOG = false>
-class CpuLogits1DSoftmaxKernel : public ICpuKernel<CpuLogits1DSoftmaxKernel<IS_LOG>>
-{
-private:
-    using SoftmaxLogits1DKernelPtr = std::add_pointer<void(
-        const ITensor *, const ITensor *, void *const, ITensor *, float, bool, const Window &)>::type;
-
-public:
-    CpuLogits1DSoftmaxKernel() = default;
-    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuLogits1DSoftmaxKernel);
+    CpuSoftmaxKernel() = default;
+    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuSoftmaxKernel);
     /** Set the input and output tensors.
      *
-     * @param[in]  src  Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
-     * @param[in]  max  Max values tensor info. Same shape as input with dimension 0 set to 1.
-     *                  Data types supported: same as @p input.
-     * @param[out] dst  Destination tensor info. Data types supported: same as @p input.
-     * @param[in]  beta A scaling factor for the exponent.
+     * @param[in]  src    Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+     * @param[out] dst    Destination tensor info. Data types supported: same as @p input.
+     * @param[in]  beta   A scaling factor for the exponent.
+     * @param[in]  is_log True if the operation is log-softmax
      *
     * @param tmp Auxiliary tensor info. Must be type F32 and same shape as the input.
     */
-    void
-    configure(const ITensorInfo *src, const ITensorInfo *max, ITensorInfo *dst, const float beta, ITensorInfo *tmp);
+    void configure(const ITensorInfo *src, ITensorInfo *dst, float beta, bool is_log, ITensorInfo *tmp);
    /** Static function to check if given info will lead to a valid configuration
     *
-     * Similar to CpuLogits1DSoftmaxKernel::configure()
+     * Similar to CpuSoftmaxKernel::configure()
     *
     * @return a status
     */
-    static Status validate(const ITensorInfo *src,
-                           const ITensorInfo *max,
-                           const ITensorInfo *dst,
-                           const float        beta,
-                           const ITensorInfo *tmp);
+    static Status
+    validate(const ITensorInfo *src, const ITensorInfo *dst, float beta, bool is_log, const ITensorInfo *tmp);
 
     // Inherited methods overridden:
     void        run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
     const char *name() const override;
 
-    struct SoftmaxLogits1DKernel
+    struct SoftmaxKernel
     {
-        const char                  *name;
-        const DataTypeISASelectorPtr is_selected;
-        SoftmaxLogits1DKernelPtr     ukernel;
+        const char                                   *name;
+        const SoftmaxKernelDataTypeISASelectorDataPtr is_selected;
+        SoftmaxKernelPtr                              ukernel;
     };
 
-    static const std::vector<SoftmaxLogits1DKernel> &get_available_kernels();
+    static const std::vector<SoftmaxKernel> &get_available_kernels();
 
 private:
-    float                    _beta{1.0f};
-    SoftmaxLogits1DKernelPtr _run_method{nullptr};
-    std::string              _name{};
+    float            _beta{1.0f};
+    SoftmaxKernelPtr _run_method{nullptr};
+    std::string      _name{};
 };
 } // namespace kernels
 } // namespace cpu
 } // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_SOFTMAX_KERNEL_H */
+#endif // ACL_SRC_CPU_KERNELS_CPUSOFTMAXKERNEL_H
diff --git a/src/cpu/kernels/softmax/generic/neon/fp16.cpp b/src/cpu/kernels/softmax/generic/neon/fp16.cpp
index 2e2adf33e0..db8f881712 100644
--- a/src/cpu/kernels/softmax/generic/neon/fp16.cpp
+++ b/src/cpu/kernels/softmax/generic/neon/fp16.cpp
@@ -31,21 +31,18 @@ namespace arm_compute
 {
 namespace cpu
 {
-void neon_fp16_softmax(const ITensor *in,
-                       const ITensor *max,
-                       void *const    tmp,
-                       ITensor       *out,
-                       const float    beta,
-                       bool           is_log,
-                       const Window  &window)
-{
-    return neon_softmax_logits_1d_float<float16_t>(in, max, tmp, out, beta, is_log, window);
-}
-void neon_fp16_logits(const ITensor *in, ITensor *out, const Window &window)
+template <bool IS_LOG>
+void neon_fp16_softmax(const ITensor *in, void *const tmp, ITensor *out, const float beta, const Window &window)
 {
-    return neon_logits_1d_max<float16_t>(in, out, window);
+    return neon_softmax_float<float16_t, IS_LOG>(in, tmp, out, beta, window);
 }
+
+template void
+neon_fp16_softmax<true>(const ITensor *in, void *const tmp, ITensor *out, const float beta, const Window &window);
+template void
+neon_fp16_softmax<false>(const ITensor *in, void *const tmp, ITensor *out, const float beta, const Window &window);
+
 } // namespace cpu
 } // namespace arm_compute
 #endif //defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
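For reference, the computation the reworked kernel performs along the innermost axis is the classic three-phase softmax: a max reduction for numerical stability, an exponentiate-and-accumulate pass scaled by beta, and a normalization pass (a subtraction of log(sum) in the log-softmax case). A scalar sketch of the same phases, in plain C++ and independent of the library:

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

// Reference (log-)softmax over one row, following the same three phases as the
// kernel: max reduction, exp-and-sum, then normalization. 'beta' scales the exponent.
template <bool IS_LOG>
void reference_softmax(const std::vector<float> &in, std::vector<float> &out, float beta)
{
    const float max_val = *std::max_element(in.begin(), in.end());

    float sum = 0.f;
    for (std::size_t i = 0; i < in.size(); ++i)
    {
        const float shifted = (in[i] - max_val) * beta;
        out[i] = IS_LOG ? shifted : std::exp(shifted);
        sum += IS_LOG ? std::exp(shifted) : out[i];
    }

    // Log-softmax subtracts log(sum); plain softmax multiplies by 1/sum.
    if (IS_LOG)
    {
        const float log_sum = std::log(sum);
        for (auto &v : out) v -= log_sum;
    }
    else
    {
        const float inv_sum = 1.f / sum;
        for (auto &v : out) v *= inv_sum;
    }
}

int main()
{
    std::vector<float> in{1.f, 2.f, 3.f}, out(3);
    reference_softmax<false>(in, out, 1.f);
    std::printf("%f %f %f\n", out[0], out[1], out[2]); // ~0.09 0.24 0.67
}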
diff --git a/src/cpu/kernels/softmax/generic/neon/fp32.cpp b/src/cpu/kernels/softmax/generic/neon/fp32.cpp
index 61df40c1b5..c281d1bf31 100644
--- a/src/cpu/kernels/softmax/generic/neon/fp32.cpp
+++ b/src/cpu/kernels/softmax/generic/neon/fp32.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -29,20 +29,17 @@ namespace arm_compute
 {
 namespace cpu
 {
-void neon_fp32_softmax(const ITensor *in,
-                       const ITensor *max,
-                       void *const    tmp,
-                       ITensor       *out,
-                       const float    beta,
-                       bool           is_log,
-                       const Window  &window)
-{
-    return neon_softmax_logits_1d_float<float>(in, max, tmp, out, beta, is_log, window);
-}
-void neon_fp32_logits(const ITensor *in, ITensor *out, const Window &window)
+template <bool IS_LOG>
+void neon_fp32_softmax(const ITensor *in, void *const tmp, ITensor *out, const float beta, const Window &window)
 {
-    return neon_logits_1d_max<float>(in, out, window);
+    return neon_softmax_float<float, IS_LOG>(in, tmp, out, beta, window);
 }
+
+template void
+neon_fp32_softmax<true>(const ITensor *in, void *const tmp, ITensor *out, const float beta, const Window &window);
+template void
+neon_fp32_softmax<false>(const ITensor *in, void *const tmp, ITensor *out, const float beta, const Window &window);
+
 } // namespace cpu
 } // namespace arm_compute
diff --git a/src/cpu/kernels/softmax/generic/neon/impl.cpp b/src/cpu/kernels/softmax/generic/neon/impl.cpp
index 5d6e6a4f80..487f6ae051 100644
--- a/src/cpu/kernels/softmax/generic/neon/impl.cpp
+++ b/src/cpu/kernels/softmax/generic/neon/impl.cpp
@@ -29,43 +29,76 @@ namespace arm_compute
 {
 namespace cpu
 {
-template void neon_logits_1d_max<qasymm8_signed_t>(const ITensor *in, ITensor *out, const Window &window);
-template void neon_logits_1d_max<qasymm8_t>(const ITensor *in, ITensor *out, const Window &window);
-
-template <typename T>
-void neon_softmax_logits_1d_quantized(
-    const ITensor *in, const ITensor *max, void *const tmp, ITensor *out, float beta, bool is_log, const Window &window)
+template <typename T, bool IS_LOG>
+void neon_softmax_quantized(const ITensor *in, void *const tmp, ITensor *out, float beta, const Window &window)
 {
     static_assert(std::is_same<T, qasymm8_t>::value || std::is_same<T, qasymm8_signed_t>::value,
                   "quantized type should be either qasymm8_t or qasymm8_signed_t.");
 
-    const int start_x     = in->info()->valid_region().anchor.x();
     const int input_width = in->info()->valid_region().shape.x();
 
-    const float scale_beta     = -beta * in->info()->quantization_info().uniform().scale;
-    const auto  scale_beta_vec = vdupq_n_f32(scale_beta);
+    const float       scale_beta     = -beta * in->info()->quantization_info().uniform().scale;
+    const float32x4_t scale_beta_vec = vdupq_n_f32(scale_beta);
+
+    Iterator in_it(in, window);
+    Iterator out_it(out, window);
 
-    Iterator      in_it(in, window);
-    Iterator      max_it(max, window);
-    Iterator      out_it(out, window);
     constexpr int vec_size = 16;
 
+#ifndef __aarch64__
+    const int sum_stages = log2(vec_size >> 1);
+#endif // __aarch64__
+
+    using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>;
+
     execute_window_loop(
         window,
         [&](const Coordinates &)
         {
             /* Get pointers */
-            const auto in_ptr  = reinterpret_cast<const T *>(in_it.ptr()) + start_x;
-            const auto out_ptr = reinterpret_cast<T *>(out_it.ptr()) + start_x;
-            const auto tmp_ptr = reinterpret_cast<float *>(tmp);
+            const T *in_ptr  = reinterpret_cast<const T *>(in_it.ptr());
+            T       *out_ptr = reinterpret_cast<T *>(out_it.ptr());
+            float   *tmp_ptr = reinterpret_cast<float *>(tmp);
+
+            T max_val;
+
+            /* Compute Max */
+            {
+                // Init max value
+                auto vec_max = wrapper::vdup_n(support::cpp11::lowest<T>(), ExactTagType{});
+                int  x       = 0;
 
-            float sum{};
-            float sum_inversed{};
+                for (; x <= (input_width - vec_size); x += vec_size)
+                {
+                    const auto current_value = wrapper::vloadq(in_ptr + x);
+                    vec_max                  = wrapper::vmax(vec_max, current_value);
+                }
+
+#ifdef __aarch64__
+                max_val = wrapper::vmaxv(vec_max);
+#else  // __aarch64__
+                auto carry_max = wrapper::vpmax(wrapper::vgethigh(vec_max), wrapper::vgetlow(vec_max));
+
+                for (int i = 0; i < sum_stages; ++i)
+                {
+                    carry_max = wrapper::vpmax(carry_max, carry_max);
+                }
+
+                max_val = wrapper::vgetlane(carry_max, 0);
+#endif // __aarch64__
+
+                // Compute left-over elements
+                for (; x < input_width; ++x)
+                {
+                    max_val = std::max(*(in_ptr + x), max_val);
+                }
+            } // Compute Max
+
+            float sum_transformed{};
 
             /* Compute exponentials and sum */
             {
                 /* Get max value */
-                const auto max_val = *reinterpret_cast<const T *>(max_it.ptr());
                 const auto vec_max = wrapper::vdup_n(max_val, wrapper::traits::vector_128_tag{});
 
                 /* Init sum to zero */
@@ -80,11 +113,11 @@ void neon_softmax_logits_1d_quantized(
                 int x = 0;
                 for (; x <= (input_width - vec_size); x += vec_size)
                 {
-                    auto vec_elements     = wrapper::vloadq(in_ptr + x);
-                    vec_elements          = wrapper::vqsub(vec_max, vec_elements);
-                    auto vec_elements_flt = convert_int_to_float<float32x4x4_t>(vec_elements);
+                    auto          vec_elements     = wrapper::vloadq(in_ptr + x);
+                    vec_elements                   = wrapper::vqsub(vec_max, vec_elements);
+                    float32x4x4_t vec_elements_flt = convert_int_to_float<float32x4x4_t>(vec_elements);
 
-                    if (is_log)
+                    if (IS_LOG)
                     {
                         vec_elements_flt.val[0] = vmulq_f32(vec_elements_flt.val[0], scale_beta_vec);
                         vec_elements_flt.val[1] = vmulq_f32(vec_elements_flt.val[1], scale_beta_vec);
@@ -111,17 +144,24 @@ void neon_softmax_logits_1d_quantized(
                 }
 
                 /* Reduce sum */
-                const auto sum_16_byte =
+                const float32x4_t sum_16_byte =
                     vaddq_f32(vaddq_f32(vec_sum.val[0], vec_sum.val[1]), vaddq_f32(vec_sum.val[2], vec_sum.val[3]));
+
+                float sum;
+
+#ifdef __aarch64__
+                sum = wrapper::vaddv(sum_16_byte);
+#else  // __aarch64__
                 auto sum_res = vpadd_f32(vget_high_f32(sum_16_byte), vget_low_f32(sum_16_byte));
                 sum_res      = vpadd_f32(sum_res, sum_res);
                 sum          = wrapper::vgetlane(sum_res, 0);
+#endif // __aarch64__
 
                 /* Run remaining elements */
                 for (; x < input_width; ++x)
                 {
                     float element{};
-                    if (is_log)
+                    if (IS_LOG)
                     {
                         element = (max_val - in_ptr[x]) * scale_beta;
                         sum += std::exp(element);
@@ -135,19 +175,22 @@ void neon_softmax_logits_1d_quantized(
                     tmp_ptr[x] = element;
                 }
 
-                if (!is_log)
+                if (!IS_LOG)
                 {
-                    sum_inversed = 256.f / sum;
+                    sum_transformed = 256.f / sum;
                 }
                 else
                 {
-                    sum = std::log(sum);
+                    sum_transformed = std::log(sum);
                 }
-            }
+            } // Compute exponentials and sum
 
             /* Normalize exponentials */
             {
                 constexpr bool is_qasymm8_signed = std::is_same<T, qasymm8_signed_t>::value;
+
+                const float32x4_t sum_vec = vdupq_n_f32(sum_transformed);
+
                 /* Loop over row and compute softmax */
                 int x = 0;
                 for (; x <= (input_width - vec_size); x += vec_size)
@@ -155,23 +198,23 @@ void neon_softmax_logits_1d_quantized(
                     using int_vec_type = wrapper::traits::neon_vector_t<T, 16>;
                     float32x4x4_t vec_in           = vld4q_f32(tmp_ptr + x);
                     int_vec_type  normalized_value{};
-                    if (is_log)
+                    if (IS_LOG)
                     {
                         const float32x4x4_t sub = {
-                            vsubq_f32(vec_in.val[0], vdupq_n_f32(sum)),
-                            vsubq_f32(vec_in.val[1], vdupq_n_f32(sum)),
-                            vsubq_f32(vec_in.val[2], vdupq_n_f32(sum)),
-                            vsubq_f32(vec_in.val[3], vdupq_n_f32(sum)),
+                            vsubq_f32(vec_in.val[0], sum_vec),
+                            vsubq_f32(vec_in.val[1], sum_vec),
+                            vsubq_f32(vec_in.val[2], sum_vec),
+                            vsubq_f32(vec_in.val[3], sum_vec),
                         };
                         normalized_value = convert_float_to_int<float32x4x4_t, int_vec_type>(sub);
                     }
                     else
                     {
                         float32x4x4_t mul = {
-                            vmulq_f32(vec_in.val[0], vdupq_n_f32(sum_inversed)),
-                            vmulq_f32(vec_in.val[1], vdupq_n_f32(sum_inversed)),
-                            vmulq_f32(vec_in.val[2], vdupq_n_f32(sum_inversed)),
-                            vmulq_f32(vec_in.val[3], vdupq_n_f32(sum_inversed)),
+                            vmulq_f32(vec_in.val[0], sum_vec),
+                            vmulq_f32(vec_in.val[1], sum_vec),
+                            vmulq_f32(vec_in.val[2], sum_vec),
+                            vmulq_f32(vec_in.val[3], sum_vec),
                         };
 
                         if (is_qasymm8_signed)
@@ -190,34 +233,31 @@ void neon_softmax_logits_1d_quantized(
                 /* Run remaining elements */
                 for (; x < input_width; ++x)
                 {
-                    if (is_log)
+                    if (IS_LOG)
                     {
-                        out_ptr[x] = utils::cast::saturate_cast<T>(tmp_ptr[x] - sum);
+                        out_ptr[x] = utils::cast::saturate_cast<T>(tmp_ptr[x] - sum_transformed);
                     }
                     else
                     {
-                        out_ptr[x] = utils::cast::saturate_cast<T>((tmp_ptr[x] * sum_inversed) -
+                        out_ptr[x] = utils::cast::saturate_cast<T>((tmp_ptr[x] * sum_transformed) -
                                                                    (is_qasymm8_signed ? 128.f : 0));
                     }
                 }
-            }
+            } // Normalize exponentials
         },
-        in_it, max_it, out_it);
+        in_it, out_it);
 }
 
-template void neon_softmax_logits_1d_quantized<qasymm8_signed_t>(const ITensor *in,
-                                                                 const ITensor *max,
-                                                                 void *const    tmp,
-                                                                 ITensor       *out,
-                                                                 float          beta,
-                                                                 bool           is_log,
-                                                                 const Window  &window);
-template void neon_softmax_logits_1d_quantized<qasymm8_t>(const ITensor *in,
-                                                          const ITensor *max,
-                                                          void *const    tmp,
-                                                          ITensor       *out,
-                                                          float          beta,
-                                                          bool           is_log,
-                                                          const Window  &window);
+template void neon_softmax_quantized<qasymm8_signed_t, true>(
+    const ITensor *in, void *const tmp, ITensor *out, float beta, const Window &window);
+
+template void neon_softmax_quantized<qasymm8_signed_t, false>(
+    const ITensor *in, void *const tmp, ITensor *out, float beta, const Window &window);
+
+template void neon_softmax_quantized<qasymm8_t, true>(
+    const ITensor *in, void *const tmp, ITensor *out, float beta, const Window &window);
+
+template void neon_softmax_quantized<qasymm8_t, false>(
+    const ITensor *in, void *const tmp, ITensor *out, float beta, const Window &window);
 } // namespace cpu
 } // namespace arm_compute
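The quantized path above still needs the F32 scratch area: exponentials are stored to tmp at full precision before being requantized with saturate_cast, and run_op (earlier in this patch) hands each thread its own row-sized slice of one shared tmp tensor. A sketch of that slicing arithmetic, with illustrative values (num_threads and row_width are assumptions, not library constants):

#include <cstdint>
#include <cstdio>
#include <vector>

// Sketch of the per-thread scratch slicing done in run_op: one shared buffer
// holds 'row_width' floats per thread, and each thread offsets into it by
// thread_id * tmp_size_for_thread bytes.
int main()
{
    const unsigned int num_threads = 4;
    const unsigned int row_width   = 128; // elements per softmax row
    std::vector<float> tmp(num_threads * row_width);

    const unsigned int tmp_bytes_per_thread = row_width * sizeof(float);
    for (unsigned int thread_id = 0; thread_id < num_threads; ++thread_id)
    {
        // Equivalent of: tmp->buffer() + (info.thread_id * tmp_size_for_thread)
        std::uint8_t *base           = reinterpret_cast<std::uint8_t *>(tmp.data());
        std::uint8_t *tmp_for_thread = base + thread_id * tmp_bytes_per_thread;
        std::printf("thread %u scratch at offset %td bytes\n", thread_id, tmp_for_thread - base);
    }
}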
diff --git a/src/cpu/kernels/softmax/generic/neon/impl.h b/src/cpu/kernels/softmax/generic/neon/impl.h
index 4d9b789297..60380cd233 100644
--- a/src/cpu/kernels/softmax/generic/neon/impl.h
+++ b/src/cpu/kernels/softmax/generic/neon/impl.h
@@ -21,8 +21,8 @@
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
-#ifndef SRC_CORE_NEON_KERNELS_SOFTMAX_IMPL_H
-#define SRC_CORE_NEON_KERNELS_SOFTMAX_IMPL_H
+#ifndef ACL_SRC_CPU_KERNELS_SOFTMAX_GENERIC_NEON_IMPL_H
+#define ACL_SRC_CPU_KERNELS_SOFTMAX_GENERIC_NEON_IMPL_H
 
 #include "arm_compute/core/Helpers.h"
 
@@ -33,105 +33,100 @@ namespace arm_compute
 {
 namespace cpu
 {
-template <typename T>
-void neon_logits_1d_max(const ITensor *in, ITensor *out, const Window &window)
-{
-    /** SIMD vector tag type. */
-    using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>;
-
-    constexpr int window_step_x  = 16 / sizeof(T);
-    const auto    window_start_x = static_cast<int>(window.x().start());
-    const auto    window_end_x   = static_cast<int>(window.x().end());
-
-    Window win{window};
-    win.set(Window::DimX, Window::Dimension(0, 1, 1));
-    Iterator input(in, win);
-    Iterator output(out, win);
-
-    const int sum_stages = log2(window_step_x / 2);
-    execute_window_loop(
-        win,
-        [&](const Coordinates &)
-        {
-            // Get pointers
-            const auto in_ptr  = reinterpret_cast<const T *>(input.ptr());
-            const auto out_ptr = reinterpret_cast<T *>(output.ptr());
-
-            // Init max value
-            auto vec_max = wrapper::vdup_n(support::cpp11::lowest<T>(), ExactTagType{});
-            int  x       = window_start_x;
-
-            for (; x <= (window_end_x - window_step_x); x += window_step_x)
-            {
-                const auto current_value = wrapper::vloadq(in_ptr + x);
-                vec_max                  = wrapper::vmax(vec_max, current_value);
-            }
-            auto carry_max = wrapper::vpmax(wrapper::vgethigh(vec_max), wrapper::vgetlow(vec_max));
-
-            for (int i = 0; i < sum_stages; ++i)
-            {
-                carry_max = wrapper::vpmax(carry_max, carry_max);
-            }
-            T max_val = wrapper::vgetlane(carry_max, 0);
-
-            // Compute left-over elements
-            for (; x < window_end_x; ++x)
-            {
-                max_val = *(in_ptr + x) > max_val ? *(in_ptr + x) : max_val;
-            }
-
-            *out_ptr = max_val;
-        },
-        input, output);
+#ifdef __aarch64__
+namespace
+{
+// These helper functions are added because vaddv does not exist for fp16,
+// and, therefore, is not part of the wrapper::vaddv interface.
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+inline float16_t wrapper_vaddv(const float16x8_t &a, int sum_stages)
+{
+    auto sum_res = wrapper::vpadd(wrapper::vgethigh(a), wrapper::vgetlow(a));
+    for (int i = 0; i < sum_stages; ++i)
+    {
+        sum_res = wrapper::vpadd(sum_res, sum_res);
+    }
+    return wrapper::vgetlane(sum_res, 0);
+}
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+inline float wrapper_vaddv(const float32x4_t &a, int sum_stages)
+{
+    ARM_COMPUTE_UNUSED(sum_stages);
+    return wrapper::vaddv(a);
 }
+} // namespace
+#endif // __aarch64__
 
-template <typename T>
-void neon_softmax_logits_1d_quantized(const ITensor *in,
-                                      const ITensor *max,
-                                      void *const    tmp,
-                                      ITensor       *out,
-                                      float          beta,
-                                      bool           is_log,
-                                      const Window  &window);
-
-template <typename T>
-void neon_softmax_logits_1d_float(const ITensor *in,
-                                  const ITensor *max,
-                                  void *const    tmp,
-                                  ITensor       *out,
-                                  const float    beta,
-                                  bool           is_log,
-                                  const Window  &window)
+// The template implementation for float data types is stored in the header file because
+// we need all fp16 instantiated code to live in fp16.cpp files.
+template <typename T, bool IS_LOG>
+void neon_softmax_float(const ITensor *in, void *const tmp, ITensor *out, float beta, const Window &window)
 {
-    const int start_x     = in->info()->valid_region().anchor.x();
+    ARM_COMPUTE_UNUSED(tmp);
+
     const int input_width = in->info()->valid_region().shape.x();
 
     Iterator in_it(in, window);
-    Iterator max_it(max, window);
     Iterator out_it(out, window);
 
     /** SIMD vector tag type. */
     using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>;
 
-    constexpr int vec_size   = 16 / sizeof(T);
-    const int     sum_stages = log2(vec_size / 2);
+    constexpr int vec_size = 16 / sizeof(T);
+
+    const int sum_stages = log2(vec_size >> 1);
+
+    const auto beta_vec = wrapper::vdup_n(static_cast<T>(beta), ExactTagType{});
 
     execute_window_loop(
         window,
         [&](const Coordinates &)
         {
             /* Get pointers */
-            const auto in_ptr  = reinterpret_cast<const T *>(in_it.ptr()) + start_x;
-            const auto out_ptr = reinterpret_cast<T *>(out_it.ptr()) + start_x;
-            const auto tmp_ptr = reinterpret_cast<T *>(tmp);
+            const T *in_ptr  = reinterpret_cast<const T *>(in_it.ptr());
+            T       *out_ptr = reinterpret_cast<T *>(out_it.ptr());
+
+            T max_val;
+
+            /* Compute Max */
+            {
+                // Init max value
+                auto vec_max = wrapper::vdup_n(support::cpp11::lowest<T>(), ExactTagType{});
+                int  x       = 0;
+
+                for (; x <= (input_width - vec_size); x += vec_size)
+                {
+                    const auto current_value = wrapper::vloadq(in_ptr + x);
+                    vec_max                  = wrapper::vmax(vec_max, current_value);
+                }
+
+#ifdef __aarch64__
+                max_val = wrapper::vmaxv(vec_max);
+#else  // __aarch64__
+                auto carry_max = wrapper::vpmax(wrapper::vgethigh(vec_max), wrapper::vgetlow(vec_max));
+
+                for (int i = 0; i < sum_stages; ++i)
+                {
+                    carry_max = wrapper::vpmax(carry_max, carry_max);
+                }
+
+                max_val = wrapper::vgetlane(carry_max, 0);
+#endif // __aarch64__
 
-            T sum{};
-            T sum_inversed{};
+                // Compute left-over elements
+                for (; x < input_width; ++x)
+                {
+                    max_val = std::max(*(in_ptr + x), max_val);
+                }
+            } // compute max
+
+            T sum_transformed{};
 
             /* Compute exponentials and sum */
             {
                 /* Get max value */
-                const auto max_val = *reinterpret_cast<const T *>(max_it.ptr());
                 const auto vec_max = wrapper::vdup_n(max_val, ExactTagType{});
 
                 /* Init sum to zero */
@@ -143,35 +138,38 @@ void neon_softmax_logits_1d_float(const ITensor *in,
                 {
                     auto vec_elements = wrapper::vloadq(in_ptr + x);
                     vec_elements      = wrapper::vsub(vec_elements, vec_max);
-                    if (is_log)
+                    if (IS_LOG)
                     {
-                        vec_elements =
-                            wrapper::vmul(vec_elements, wrapper::vdup_n(static_cast<T>(beta), ExactTagType{}));
-                        vec_sum = wrapper::vadd(vec_sum, wrapper::vexpq(vec_elements));
+                        vec_elements = wrapper::vmul(vec_elements, beta_vec);
+                        vec_sum      = wrapper::vadd(vec_sum, wrapper::vexpq(vec_elements));
                     }
                     else
                     {
-                        vec_elements = wrapper::vexpq(
-                            wrapper::vmul(vec_elements, wrapper::vdup_n(static_cast<T>(beta), ExactTagType{})));
-                        vec_sum = wrapper::vadd(vec_sum, vec_elements);
+                        vec_elements = wrapper::vexpq(wrapper::vmul(vec_elements, beta_vec));
+                        vec_sum      = wrapper::vadd(vec_sum, vec_elements);
                     }
-                    wrapper::vstore(tmp_ptr + x, vec_elements);
+                    wrapper::vstore(out_ptr + x, vec_elements);
                 }
 
                 /* Reduce sum */
+                T sum{};
+#ifdef __aarch64__
+                sum = wrapper_vaddv(vec_sum, sum_stages);
+#else  // __aarch64__
                 auto sum_res = wrapper::vpadd(wrapper::vgethigh(vec_sum), wrapper::vgetlow(vec_sum));
                 for (int i = 0; i < sum_stages; ++i)
                 {
                     sum_res = wrapper::vpadd(sum_res, sum_res);
                 }
                 sum = wrapper::vgetlane(sum_res, 0);
+#endif // __aarch64__
 
                 /* Run remaining elements */
                 for (; x < input_width; ++x)
                 {
                     T element{};
-                    if (is_log)
+                    if (IS_LOG)
                     {
                         element = (in_ptr[x] - max_val) * beta;
                         sum += std::exp(element);
@@ -181,55 +179,59 @@ void neon_softmax_logits_1d_float(const ITensor *in,
                         element = std::exp((in_ptr[x] - max_val) * beta);
                         sum += element;
                     }
-                    tmp_ptr[x] = element;
+
+                    out_ptr[x] = element;
                 }
 
-                if (!is_log)
+                if (!IS_LOG)
                 {
-                    sum_inversed = T(1) / sum;
+                    sum_transformed = T(1) / sum;
                 }
                 else
                 {
-                    sum = static_cast<T>(std::log(sum));
+                    sum_transformed = static_cast<T>(std::log(sum));
                 }
-            }
+            } // Compute exponentials and sum
 
             /* Normalize exponentials */
             {
+                const auto sum_vec = wrapper::vdup_n(static_cast<T>(sum_transformed), ExactTagType{});
+
                 /* Loop over row and compute softmax */
                 int x = 0;
                 for (; x <= (input_width - vec_size); x += vec_size)
                 {
-                    auto vec_in           = wrapper::vloadq(tmp_ptr + x);
-                    auto normalized_value = wrapper::vdup_n(static_cast<T>(0), ExactTagType{});
-                    if (is_log)
+                    const auto vec_in = wrapper::vloadq(out_ptr + x);
+                    if (IS_LOG)
                     {
-                        normalized_value = wrapper::vsub(vec_in, wrapper::vdup_n(static_cast<T>(sum), ExactTagType{}));
+                        wrapper::vstore(out_ptr + x, wrapper::vsub(vec_in, sum_vec));
                     }
                     else
                     {
-                        normalized_value =
-                            wrapper::vmul(vec_in, wrapper::vdup_n(static_cast<T>(sum_inversed), ExactTagType{}));
+                        wrapper::vstore(out_ptr + x, wrapper::vmul(vec_in, sum_vec));
                     }
-                    wrapper::vstore(out_ptr + x, normalized_value);
                 }
+
                 /* Run remaining elements */
                 for (; x < input_width; ++x)
                 {
-                    if (is_log)
+                    if (IS_LOG)
                     {
-                        out_ptr[x] = tmp_ptr[x] - sum;
+                        out_ptr[x] = out_ptr[x] - sum_transformed;
                     }
                     else
                     {
-                        out_ptr[x] = tmp_ptr[x] * sum_inversed;
+                        out_ptr[x] = out_ptr[x] * sum_transformed;
                     }
                 }
-            }
+            } // Normalize exponentials
         },
-        in_it, max_it, out_it);
+        in_it, out_it);
 }
+
+template <typename T, bool IS_LOG>
+void neon_softmax_quantized(const ITensor *in, void *const tmp, ITensor *out, float beta, const Window &window);
 } // namespace cpu
 } // namespace arm_compute
-#endif /* SRC_CORE_NEON_KERNELS_SOFTMAX_IMPL_H */
+#endif // ACL_SRC_CPU_KERNELS_SOFTMAX_GENERIC_NEON_IMPL_H
diff --git a/src/cpu/kernels/softmax/generic/neon/qasymm8.cpp b/src/cpu/kernels/softmax/generic/neon/qasymm8.cpp
index 40713dc496..9589ebcd7c 100644
--- a/src/cpu/kernels/softmax/generic/neon/qasymm8.cpp
+++ b/src/cpu/kernels/softmax/generic/neon/qasymm8.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -29,20 +29,16 @@ namespace arm_compute
 {
 namespace cpu
 {
-void neon_qasymm8_softmax(const ITensor *in,
-                          const ITensor *max,
-                          void *const    tmp,
-                          ITensor       *out,
-                          const float    beta,
-                          bool           is_log,
-                          const Window  &window)
+template <bool IS_LOG>
+void neon_qasymm8_softmax(const ITensor *in, void *const tmp, ITensor *out, const float beta, const Window &window)
 {
-    return neon_softmax_logits_1d_quantized<qasymm8_t>(in, max, tmp, out, beta, is_log, window);
+    return neon_softmax_quantized<qasymm8_t, IS_LOG>(in, tmp, out, beta, window);
 }
 
-void neon_qasymm8_logits(const ITensor *in, ITensor *out, const Window &window)
-{
-    return neon_logits_1d_max<qasymm8_t>(in, out, window);
-}
+template void
+neon_qasymm8_softmax<true>(const ITensor *in, void *const tmp, ITensor *out, const float beta, const Window &window);
+template void
+neon_qasymm8_softmax<false>(const ITensor *in, void *const tmp, ITensor *out, const float beta, const Window &window);
+
 } // namespace cpu
 } // namespace arm_compute
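On AArch64 the reductions above collapse to single instructions (wrapper::vmaxv / wrapper::vaddv, with the wrapper_vaddv helper covering fp16); on 32-bit Arm the code instead runs log2(vec_size >> 1) pairwise stages after an initial high/low fold. A plain-array sketch of that tree reduction, assuming a 4-lane F32-style vector (all names illustrative):

#include <cmath>
#include <cstdio>

// Pairwise ("tree") reduction as used on armv7, where vaddv/vmaxv are not
// available: an initial fold of high half onto low half, then sum_stages
// refinements, each halving the number of live lanes.
int main()
{
    float lanes[4] = {1.f, 2.f, 3.f, 4.f};
    const int sum_stages = static_cast<int>(std::log2(4 >> 1)); // log2(vec_size >> 1) == 1

    for (int s = 0, width = 2; s <= sum_stages; ++s, width /= 2)
    {
        for (int i = 0; i < width; ++i)
            lanes[i] = lanes[2 * i] + lanes[2 * i + 1]; // vpadd adds adjacent pairs
    }
    std::printf("sum = %f\n", lanes[0]); // 10
}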
diff --git a/src/cpu/kernels/softmax/generic/neon/qasymm8_signed.cpp b/src/cpu/kernels/softmax/generic/neon/qasymm8_signed.cpp
index 2c5e284f54..0bf6b2859a 100644
--- a/src/cpu/kernels/softmax/generic/neon/qasymm8_signed.cpp
+++ b/src/cpu/kernels/softmax/generic/neon/qasymm8_signed.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -29,20 +29,17 @@ namespace arm_compute
 {
 namespace cpu
 {
-void neon_qasymm8_signed_softmax(const ITensor *in,
-                                 const ITensor *max,
-                                 void *const    tmp,
-                                 ITensor       *out,
-                                 const float    beta,
-                                 bool           is_log,
-                                 const Window  &window)
+template <bool IS_LOG>
+void neon_qasymm8_signed_softmax(
+    const ITensor *in, void *const tmp, ITensor *out, const float beta, const Window &window)
 {
-    return neon_softmax_logits_1d_quantized<qasymm8_signed_t>(in, max, tmp, out, beta, is_log, window);
+    return neon_softmax_quantized<qasymm8_signed_t, IS_LOG>(in, tmp, out, beta, window);
 }
 
-void neon_qasymm8_singed_logits(const ITensor *in, ITensor *out, const Window &window)
-{
-    return neon_logits_1d_max<qasymm8_signed_t>(in, out, window);
-}
+template void neon_qasymm8_signed_softmax<true>(
+    const ITensor *in, void *const tmp, ITensor *out, const float beta, const Window &window);
+template void neon_qasymm8_signed_softmax<false>(
+    const ITensor *in, void *const tmp, ITensor *out, const float beta, const Window &window);
+
 } // namespace cpu
 } // namespace arm_compute
diff --git a/src/cpu/kernels/softmax/generic/sve/fp16.cpp b/src/cpu/kernels/softmax/generic/sve/fp16.cpp
deleted file mode 100644
index 5e94f72faf..0000000000
--- a/src/cpu/kernels/softmax/generic/sve/fp16.cpp
+++ /dev/null
@@ -1,50 +0,0 @@
-/*
- * Copyright (c) 2021-2023 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
-#include "arm_compute/core/Helpers.h"
-
-#include "src/cpu/CpuTypes.h"
-#include "src/cpu/kernels/softmax/generic/sve/impl.h"
-namespace arm_compute
-{
-namespace cpu
-{
-void sve_fp16_softmax(const ITensor *in,
-                      const ITensor *max,
-                      void *const    tmp,
-                      ITensor       *out,
-                      const float    beta,
-                      bool           is_log,
-                      const Window  &window)
-{
-    return sve_softmax_logits_1d_float<float16_t>(in, max, tmp, out, beta, is_log, window);
-}
-
-void sve_fp16_logits(const ITensor *in, ITensor *out, const Window &window)
-{
-    return sve_logits_1d_max<float16_t>(in, out, window);
-}
-} // namespace cpu
-} // namespace arm_compute
-#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
diff --git a/src/cpu/kernels/softmax/generic/sve/fp32.cpp b/src/cpu/kernels/softmax/generic/sve/fp32.cpp
deleted file mode 100644
index d692cc2477..0000000000
--- a/src/cpu/kernels/softmax/generic/sve/fp32.cpp
+++ /dev/null
@@ -1,49 +0,0 @@
-/*
- * Copyright (c) 2021-2022 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/Helpers.h"
-
-#include "src/cpu/kernels/softmax/generic/sve/impl.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-void sve_fp32_softmax(const ITensor *in,
-                      const ITensor *max,
-                      void *const    tmp,
-                      ITensor       *out,
-                      const float    beta,
-                      bool           is_log,
-                      const Window  &window)
-{
-    return sve_softmax_logits_1d_float<float>(in, max, tmp, out, beta, is_log, window);
-}
-
-void sve_fp32_logits(const ITensor *in, ITensor *out, const Window &window)
-{
-    return sve_logits_1d_max<float>(in, out, window);
-}
-} // namespace cpu
-} // namespace arm_compute
diff --git a/src/cpu/kernels/softmax/generic/sve/impl.cpp b/src/cpu/kernels/softmax/generic/sve/impl.cpp
index 24f1bb8143..0d4b7f4509 100644
--- a/src/cpu/kernels/softmax/generic/sve/impl.cpp
+++ b/src/cpu/kernels/softmax/generic/sve/impl.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -30,6 +30,9 @@ namespace arm_compute
 {
 namespace cpu
 {
+/// TODO: (COMPMID-6505) Similar to Neon(TM), this implementation can be converted to
+/// a single kernel that performs the softmax operation. Leaving the SVE code here for
+/// future reference. The implementation for Neon(TM) is introduced in COMPMID-6500.
 template <typename ScalarType>
 void sve_logits_1d_max(const ITensor *in, ITensor *out, const Window &window)
 {
@@ -172,25 +175,5 @@ void sve_softmax_logits_1d_float(const ITensor *in,
         },
         in_it, max_it, out_it);
 }
-
-template void sve_logits_1d_max<float>(const ITensor *in, ITensor *out, const Window &window);
-template void sve_logits_1d_max<float16_t>(const ITensor *in, ITensor *out, const Window &window);
-template void sve_logits_1d_max<qasymm8_t>(const ITensor *in, ITensor *out, const Window &window);
-template void sve_logits_1d_max<qasymm8_signed_t>(const ITensor *in, ITensor *out, const Window &window);
-
-template void sve_softmax_logits_1d_float<float>(const ITensor *in,
-                                                 const ITensor *max,
-                                                 void *const    tmp,
-                                                 ITensor       *out,
-                                                 const float    beta,
-                                                 bool           is_log,
-                                                 const Window  &window);
-template void sve_softmax_logits_1d_float<float16_t>(const ITensor *in,
-                                                     const ITensor *max,
-                                                     void *const    tmp,
-                                                     ITensor       *out,
-                                                     const float    beta,
-                                                     bool           is_log,
-                                                     const Window  &window);
 } // namespace cpu
 } // namespace arm_compute
diff --git a/src/cpu/kernels/softmax/generic/sve/qasymm8.cpp b/src/cpu/kernels/softmax/generic/sve/qasymm8.cpp
deleted file mode 100644
index 85e5ccfea1..0000000000
--- a/src/cpu/kernels/softmax/generic/sve/qasymm8.cpp
+++ /dev/null
@@ -1,38 +0,0 @@
-/*
- * Copyright (c) 2021-2022 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/Helpers.h"
-
-#include "src/cpu/kernels/softmax/generic/sve/impl.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-void sve_qasymm8_logits(const ITensor *in, ITensor *out, const Window &window)
-{
-    return sve_logits_1d_max<qasymm8_t>(in, out, window);
-}
-} // namespace cpu
-} // namespace arm_compute
diff --git a/src/cpu/kernels/softmax/generic/sve/qasymm8_signed.cpp b/src/cpu/kernels/softmax/generic/sve/qasymm8_signed.cpp
deleted file mode 100644
index 4be2e2eed6..0000000000
--- a/src/cpu/kernels/softmax/generic/sve/qasymm8_signed.cpp
+++ /dev/null
@@ -1,38 +0,0 @@
-/*
- * Copyright (c) 2021-2022 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/Helpers.h"
-
-#include "src/cpu/kernels/softmax/generic/sve/impl.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-void sve_qasymm8_signed_logits(const ITensor *in, ITensor *out, const Window &window)
-{
-    return sve_logits_1d_max<qasymm8_signed_t>(in, out, window);
-}
-} // namespace cpu
-} // namespace arm_compute
diff --git a/src/cpu/kernels/softmax/generic/sve2/impl.cpp b/src/cpu/kernels/softmax/generic/sve2/impl.cpp
index 98b2f5117f..a8fb1d4adf 100644
--- a/src/cpu/kernels/softmax/generic/sve2/impl.cpp
+++ b/src/cpu/kernels/softmax/generic/sve2/impl.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -32,6 +32,9 @@ namespace arm_compute
 {
 namespace cpu
 {
+/// TODO: (COMPMID-6505) Similar to Neon(TM), this implementation can be converted to
+/// a single kernel that performs the softmax operation. Leaving the SVE2 code here for
+/// future reference. The implementation for Neon(TM) is introduced in COMPMID-6500.
 template <typename ScalarType>
 void sve2_softmax_logits_1d_quantized(
     const ITensor *in, const ITensor *max, void *const tmp, ITensor *out, float beta, bool is_log, const Window &window)
@@ -205,20 +208,5 @@ void sve2_softmax_logits_1d_quantized(
         },
         in_it, max_it, out_it);
 }
-
-template void sve2_softmax_logits_1d_quantized<qasymm8_signed_t>(const ITensor *in,
-                                                                 const ITensor *max,
-                                                                 void *const    tmp,
-                                                                 ITensor       *out,
-                                                                 float          beta,
-                                                                 bool           is_log,
-                                                                 const Window  &window);
-template void sve2_softmax_logits_1d_quantized<qasymm8_t>(const ITensor *in,
-                                                          const ITensor *max,
-                                                          void *const    tmp,
-                                                          ITensor       *out,
-                                                          float          beta,
-                                                          bool           is_log,
-                                                          const Window  &window);
 } // namespace cpu
 } // namespace arm_compute
diff --git a/src/cpu/kernels/softmax/generic/sve2/qasymm8.cpp b/src/cpu/kernels/softmax/generic/sve2/qasymm8.cpp
deleted file mode 100644
index 95623786b3..0000000000
--- a/src/cpu/kernels/softmax/generic/sve2/qasymm8.cpp
+++ /dev/null
@@ -1,44 +0,0 @@
-/*
- * Copyright (c) 2021-2022 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/Helpers.h"
-
-#include "src/cpu/kernels/softmax/generic/sve2/impl.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-void sve2_qasymm8_softmax(const ITensor *in,
-                          const ITensor *max,
-                          void *const    tmp,
-                          ITensor       *out,
-                          const float    beta,
-                          bool           is_log,
-                          const Window  &window)
-{
-    return sve2_softmax_logits_1d_quantized<qasymm8_t>(in, max, tmp, out, beta, is_log, window);
-}
-} // namespace cpu
-} // namespace arm_compute
diff --git a/src/cpu/kernels/softmax/generic/sve2/qasymm8_signed.cpp b/src/cpu/kernels/softmax/generic/sve2/qasymm8_signed.cpp
deleted file mode 100644
index c20462fcef..0000000000
--- a/src/cpu/kernels/softmax/generic/sve2/qasymm8_signed.cpp
+++ /dev/null
@@ -1,44 +0,0 @@
-/*
- * Copyright (c) 2021-2022 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/Helpers.h"
-
-#include "src/cpu/kernels/softmax/generic/sve2/impl.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-void sve2_qasymm8_signed_softmax(const ITensor *in,
-                                 const ITensor *max,
-                                 void *const    tmp,
-                                 ITensor       *out,
-                                 const float    beta,
-                                 bool           is_log,
-                                 const Window  &window)
-{
-    return sve2_softmax_logits_1d_quantized<qasymm8_signed_t>(in, max, tmp, out, beta, is_log, window);
-}
-} // namespace cpu
-} // namespace arm_compute
diff --git a/src/cpu/kernels/softmax/list.h b/src/cpu/kernels/softmax/list.h
index 627ce0c264..c143f6659d 100644
--- a/src/cpu/kernels/softmax/list.h
+++ b/src/cpu/kernels/softmax/list.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -21,41 +21,24 @@
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
-#ifndef SRC_CORE_NEON_KERNELS_SOFTMAX_LIST_H
-#define SRC_CORE_NEON_KERNELS_SOFTMAX_LIST_H
+#ifndef ACL_SRC_CPU_KERNELS_SOFTMAX_LIST_H
+#define ACL_SRC_CPU_KERNELS_SOFTMAX_LIST_H
 
 namespace arm_compute
 {
 namespace cpu
 {
-#define DECLARE_SOFTMAX_KERNEL(func_name)                                                                   \
-    void func_name(const ITensor *in, const ITensor *max, void *const tmp, ITensor *out, const float beta, \
-                   bool is_log, const Window &window)
+#define DECLARE_SOFTMAX_KERNEL(func_name) \
+    template <bool IS_LOG>                \
+    void func_name(const ITensor *in, void *const tmp, ITensor *out, const float beta, const Window &window)
 
 DECLARE_SOFTMAX_KERNEL(neon_fp32_softmax);
 DECLARE_SOFTMAX_KERNEL(neon_fp16_softmax);
 DECLARE_SOFTMAX_KERNEL(neon_qasymm8_softmax);
 DECLARE_SOFTMAX_KERNEL(neon_qasymm8_signed_softmax);
-DECLARE_SOFTMAX_KERNEL(sve_fp32_softmax);
-DECLARE_SOFTMAX_KERNEL(sve_fp16_softmax);
-DECLARE_SOFTMAX_KERNEL(sve2_qasymm8_signed_softmax);
-DECLARE_SOFTMAX_KERNEL(sve2_qasymm8_softmax);
 
 #undef DECLARE_SOFTMAX_KERNEL
-
-#define DECLARE_LOGITS_KERNEL(func_name) void func_name(const ITensor *in, ITensor *out, const Window &window)
-
-DECLARE_LOGITS_KERNEL(neon_fp32_logits);
-DECLARE_LOGITS_KERNEL(neon_fp16_logits);
-DECLARE_LOGITS_KERNEL(neon_qasymm8_logits);
-DECLARE_LOGITS_KERNEL(neon_qasymm8_singed_logits);
-DECLARE_LOGITS_KERNEL(sve_fp32_logits);
-DECLARE_LOGITS_KERNEL(sve_fp16_logits);
-DECLARE_LOGITS_KERNEL(sve_qasymm8_logits);
-DECLARE_LOGITS_KERNEL(sve_qasymm8_signed_logits);
-
-#undef DECLARE_LOGITS_KERNEL
 } // namespace cpu
 } // namespace arm_compute
-#endif /* SRC_CORE_NEON_KERNELS_SOFTMAX_LIST_H */
+#endif // ACL_SRC_CPU_KERNELS_SOFTMAX_LIST_H
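The reworked DECLARE_SOFTMAX_KERNEL macro declares each entry point as a template over IS_LOG, and each .cpp file explicitly instantiates both variants; this is what keeps fp16-instantiated code confined to the fp16.cpp files. A minimal analogue of that declare-in-header, instantiate-in-one-translation-unit pattern (my_softmax is a hypothetical name, not a library function):

// kernel_list.h -- the header only declares the template; no body is visible to callers.
template <bool IS_LOG>
void my_softmax(const float *in, float *out, int n, float beta);

// kernel_impl.cpp -- the definition lives in one translation unit and is explicitly
// instantiated for both template arguments, mirroring how fp16.cpp/fp32.cpp above
// instantiate neon_*_softmax<true> and neon_*_softmax<false>.
#include <algorithm>
#include <cmath>

template <bool IS_LOG>
void my_softmax(const float *in, float *out, int n, float beta)
{
    float max_val = in[0];
    for (int i = 1; i < n; ++i)
        max_val = std::max(max_val, in[i]);

    float sum = 0.f;
    for (int i = 0; i < n; ++i)
    {
        out[i] = std::exp((in[i] - max_val) * beta);
        sum += out[i];
    }

    // Note: a production log-softmax would keep the shifted logits instead of
    // re-taking the log of the exponentials, for precision.
    for (int i = 0; i < n; ++i)
        out[i] = IS_LOG ? std::log(out[i] / sum) : out[i] / sum;
}

template void my_softmax<true>(const float *, float *, int, float);
template void my_softmax<false>(const float *, float *, int, float);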