diff options
27 files changed, 631 insertions, 1061 deletions
diff --git a/docs/user_guide/release_version_and_change_log.dox b/docs/user_guide/release_version_and_change_log.dox index 13f4e9ea2a..ac4f0610ea 100644 --- a/docs/user_guide/release_version_and_change_log.dox +++ b/docs/user_guide/release_version_and_change_log.dox @@ -46,6 +46,8 @@ v24.01 Public major release You should link only to the main `libarm_compute` library for core functionality. - New features - Add support for FP16 in all multi_isa builds. + - Performance optimizations: + - Optimize @ref NESoftmaxLayer v23.11 Public major release - New features @@ -438,8 +440,8 @@ v21.02 Public major release - @ref NEActivationLayer - @ref NEArithmeticAddition - @ref NEBatchNormalizationLayerKernel - - @ref cpu::kernels::CpuLogits1DSoftmaxKernel - - @ref cpu::kernels::CpuLogits1DMaxKernel + - cpu::kernels::CpuLogits1DSoftmaxKernel + - cpu::kernels::CpuLogits1DMaxKernel - @ref cpu::kernels::CpuElementwiseUnaryKernel - Remove padding from OpenCL kernels: - CLDirectConvolutionLayerKernel diff --git a/filelist.json b/filelist.json index c34eff2ff9..60f4285a03 100644 --- a/filelist.json +++ b/filelist.json @@ -2213,16 +2213,10 @@ "qasymm8_signed":["src/cpu/kernels/softmax/generic/neon/qasymm8_signed.cpp"] }, "sve": { - "common": [ "src/cpu/kernels/softmax/generic/sve/impl.cpp" ], - "fp32": ["src/cpu/kernels/softmax/generic/sve/fp32.cpp"], - "fp16": ["src/cpu/kernels/softmax/generic/sve/fp16.cpp"], - "qasymm8": ["src/cpu/kernels/softmax/generic/sve/qasymm8.cpp" ], - "qasymm8_signed": ["src/cpu/kernels/softmax/generic/sve/qasymm8_signed.cpp"] + "common": [ "src/cpu/kernels/softmax/generic/sve/impl.cpp" ] }, "sve2":{ - "common" :["src/cpu/kernels/softmax/generic/sve2/impl.cpp"], - "qasymm8":[ "src/cpu/kernels/softmax/generic/sve2/qasymm8.cpp"], - "qasymm8_signed":["src/cpu/kernels/softmax/generic/sve2/qasymm8_signed.cpp"] + "common" :["src/cpu/kernels/softmax/generic/sve2/impl.cpp"] } } }, diff --git a/src/BUILD.bazel b/src/BUILD.bazel index f281b6a4d5..c14e10c836 100644 --- 
a/src/BUILD.bazel +++ b/src/BUILD.bazel @@ -117,9 +117,7 @@ filegroup( "cpu/kernels/elementwise_binary/generic/sve2/qasymm8_signed.cpp", "cpu/kernels/elementwise_unary/generic/sve2/q8.cpp", "cpu/kernels/lut/generic/sve2/u8.cpp", - "cpu/kernels/softmax/generic/sve2/impl.cpp", - "cpu/kernels/softmax/generic/sve2/qasymm8.cpp", - "cpu/kernels/softmax/generic/sve2/qasymm8_signed.cpp"] + + "cpu/kernels/softmax/generic/sve2/impl.cpp"] + glob(["**/*.h", "**/*.hpp", "**/*.inl"]), @@ -342,11 +340,7 @@ filegroup( "cpu/kernels/scale/sve/integer.cpp", "cpu/kernels/scale/sve/qasymm8.cpp", "cpu/kernels/scale/sve/qasymm8_signed.cpp", - "cpu/kernels/softmax/generic/sve/fp16.cpp", - "cpu/kernels/softmax/generic/sve/fp32.cpp", - "cpu/kernels/softmax/generic/sve/impl.cpp", - "cpu/kernels/softmax/generic/sve/qasymm8.cpp", - "cpu/kernels/softmax/generic/sve/qasymm8_signed.cpp"] + + "cpu/kernels/softmax/generic/sve/impl.cpp"] + glob(["**/*.h", "**/*.hpp", "**/*.inl"]), diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 3229ffa8c2..e6c6782da1 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -317,11 +317,7 @@ target_sources( cpu/kernels/scale/sve/integer.cpp cpu/kernels/scale/sve/qasymm8.cpp cpu/kernels/scale/sve/qasymm8_signed.cpp - cpu/kernels/softmax/generic/sve/fp16.cpp - cpu/kernels/softmax/generic/sve/fp32.cpp cpu/kernels/softmax/generic/sve/impl.cpp - cpu/kernels/softmax/generic/sve/qasymm8.cpp - cpu/kernels/softmax/generic/sve/qasymm8_signed.cpp ) target_sources( @@ -339,8 +335,6 @@ target_sources( cpu/kernels/elementwise_unary/generic/sve2/q8.cpp cpu/kernels/lut/generic/sve2/u8.cpp cpu/kernels/softmax/generic/sve2/impl.cpp - cpu/kernels/softmax/generic/sve2/qasymm8.cpp - cpu/kernels/softmax/generic/sve2/qasymm8_signed.cpp ) target_sources( diff --git a/src/core/NEON/wrapper/intrinsics/max.h b/src/core/NEON/wrapper/intrinsics/max.h index cec437d171..32d38a856c 100644 --- a/src/core/NEON/wrapper/intrinsics/max.h +++ b/src/core/NEON/wrapper/intrinsics/max.h @@ -1,5 
+1,5 @@ /* - * Copyright (c) 2018-2020 Arm Limited. + * Copyright (c) 2018-2020, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_WRAPPER_MAX_H -#define ARM_COMPUTE_WRAPPER_MAX_H +#ifndef ACL_SRC_CORE_NEON_WRAPPER_INTRINSICS_MAX_H +#define ACL_SRC_CORE_NEON_WRAPPER_INTRINSICS_MAX_H #include <arm_neon.h> @@ -59,6 +59,39 @@ VMAX_IMPL(float16_t, float16x8_t, vmaxq, f16) #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC #undef VMAX_IMPL + +#if defined(__aarch64__) +// VMAXV: Across vector max +#define VMAXV_IMPL(stype, vtype, prefix, postfix) \ + inline stype vmaxv(const vtype &a) \ + { \ + return prefix##_##postfix(a); \ + } + +VMAXV_IMPL(uint8_t, uint8x8_t, vmaxv, u8) +VMAXV_IMPL(int8_t, int8x8_t, vmaxv, s8) +VMAXV_IMPL(uint16_t, uint16x4_t, vmaxv, u16) +VMAXV_IMPL(int16_t, int16x4_t, vmaxv, s16) +VMAXV_IMPL(uint32_t, uint32x2_t, vmaxv, u32) +VMAXV_IMPL(int32_t, int32x2_t, vmaxv, s32) +VMAXV_IMPL(float, float32x2_t, vmaxv, f32) +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +VMAXV_IMPL(float16_t, float16x4_t, vmaxv, f16) +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + +VMAXV_IMPL(uint8_t, uint8x16_t, vmaxvq, u8) +VMAXV_IMPL(int8_t, int8x16_t, vmaxvq, s8) +VMAXV_IMPL(uint16_t, uint16x8_t, vmaxvq, u16) +VMAXV_IMPL(int16_t, int16x8_t, vmaxvq, s16) +VMAXV_IMPL(uint32_t, uint32x4_t, vmaxvq, u32) +VMAXV_IMPL(int32_t, int32x4_t, vmaxvq, s32) +VMAXV_IMPL(float, float32x4_t, vmaxvq, f32) +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +VMAXV_IMPL(float16_t, float16x8_t, vmaxvq, f16) +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + +#undef VMAXV_IMPL +#endif // defined(__aarch64__) } // namespace wrapper } // namespace arm_compute -#endif /* ARM_COMPUTE_WRAPPER_MAX_H */ +#endif // ACL_SRC_CORE_NEON_WRAPPER_INTRINSICS_MAX_H diff --git a/src/cpu/kernels/CpuKernelSelectionTypes.h b/src/cpu/kernels/CpuKernelSelectionTypes.h index 
b7daa4d583..45ebeec394 100644 --- a/src/cpu/kernels/CpuKernelSelectionTypes.h +++ b/src/cpu/kernels/CpuKernelSelectionTypes.h @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_CPU_KERNEL_SELECTION_TYPES_H -#define ARM_COMPUTE_CPU_KERNEL_SELECTION_TYPES_H +#ifndef ACL_SRC_CPU_KERNELS_CPUKERNELSELECTIONTYPES_H +#define ACL_SRC_CPU_KERNELS_CPUKERNELSELECTIONTYPES_H #include "arm_compute/core/Types.h" @@ -99,6 +99,13 @@ struct ScaleKernelDataTypeISASelectorData InterpolationPolicy interpolation_policy; }; +struct SoftmaxKernelDataTypeISASelectorData +{ + DataType dt; + cpuinfo::CpuIsaInfo isa; + bool is_log; +}; + // Selector pointer types using DataTypeISASelectorPtr = std::add_pointer<bool(const DataTypeISASelectorData &data)>::type; using DataTypeDataLayoutSelectorPtr = std::add_pointer<bool(const DataTypeDataLayoutISASelectorData &data)>::type; @@ -113,9 +120,10 @@ using CpuAddKernelDataTypeISASelectorDataPtr = std::add_pointer<bool(const CpuAddKernelDataTypeISASelectorData &data)>::type; using ScaleKernelDataTypeISASelectorDataPtr = std::add_pointer<bool(const ScaleKernelDataTypeISASelectorData &data)>::type; - +using SoftmaxKernelDataTypeISASelectorDataPtr = + std::add_pointer<bool(const SoftmaxKernelDataTypeISASelectorData &data)>::type; } // namespace kernels } // namespace cpu } // namespace arm_compute -#endif // ARM_COMPUTE_CPU_KERNEL_SELECTION_TYPES_H +#endif // ACL_SRC_CPU_KERNELS_CPUKERNELSELECTIONTYPES_H diff --git a/src/cpu/kernels/CpuSoftmaxKernel.cpp b/src/cpu/kernels/CpuSoftmaxKernel.cpp index ce144351f8..486f55e2c1 100644 --- a/src/cpu/kernels/CpuSoftmaxKernel.cpp +++ b/src/cpu/kernels/CpuSoftmaxKernel.cpp @@ -34,9 +34,12 @@ #include "src/core/common/Registrars.h" #include "src/core/CPP/Validate.h" #include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/Utils.h" #include "src/core/helpers/WindowHelpers.h" #include "src/cpu/kernels/softmax/list.h" 
+#include <vector> + namespace arm_compute { namespace cpu @@ -45,136 +48,40 @@ namespace kernels { namespace { -/* Softmax Logits 1D Max - identifying the max value of 1D Logits */ -static const std::vector<CpuLogits1DMaxKernel::SoftmaxLogits1DMaxKernel> available_kernels_max_logits = { - {"sve_fp32_logits_1d_max", - [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F32) && data.isa.sve; }, - REGISTER_FP32_SVE(sve_fp32_logits)}, - {"sve_fp16_logits_1d_max", - [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F16) && data.isa.sve && data.isa.fp16; }, - REGISTER_FP16_SVE(sve_fp16_logits)}, - {"sve_qu8_logits_1d_max", - [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8) && data.isa.sve; }, - REGISTER_QASYMM8_SVE(sve_qasymm8_logits)}, - {"sve_qs8_logits_1d_max", - [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8_SIGNED) && data.isa.sve; }, - REGISTER_QASYMM8_SIGNED_SVE(sve_qasymm8_signed_logits)}, - {"neon_fp32_logits_1d_max", [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F32); }, - REGISTER_FP32_NEON(neon_fp32_logits)}, - {"neon_fp16_logits_1d_max", - [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F16) && data.isa.fp16; }, - REGISTER_FP16_NEON(neon_fp16_logits)}, - {"neon_qu8_logits_1d_max", [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8); }, - REGISTER_QASYMM8_NEON(neon_qasymm8_logits)}, - {"neon_qs8_logits_1d_max", - [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8_SIGNED); }, - REGISTER_QASYMM8_SIGNED_NEON(neon_qasymm8_singed_logits)}, +/* Softmax */ +static const std::vector<typename CpuSoftmaxKernel::SoftmaxKernel> available_kernels = { + {"neon_fp32_softmax", + [](const SoftmaxKernelDataTypeISASelectorData &data) { return (!data.is_log && data.dt == DataType::F32); }, + REGISTER_FP32_NEON(neon_fp32_softmax<false>)}, + {"neon_fp16_softmax", + [](const 
SoftmaxKernelDataTypeISASelectorData &data) + { return (!data.is_log && data.dt == DataType::F16) && data.isa.fp16; }, + REGISTER_FP16_NEON(neon_fp16_softmax<false>)}, + {"neon_qu8_softmax", + [](const SoftmaxKernelDataTypeISASelectorData &data) { return (!data.is_log && data.dt == DataType::QASYMM8); }, + REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_qasymm8_softmax<false>)}, + {"neon_qs8_softmax", + [](const SoftmaxKernelDataTypeISASelectorData &data) + { return (!data.is_log && data.dt == DataType::QASYMM8_SIGNED); }, + REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_qasymm8_signed_softmax<false>)}, + {"neon_fp32_log_softmax", + [](const SoftmaxKernelDataTypeISASelectorData &data) { return (data.is_log && data.dt == DataType::F32); }, + REGISTER_FP32_NEON(neon_fp32_softmax<true>)}, + {"neon_fp16_log_softmax", + [](const SoftmaxKernelDataTypeISASelectorData &data) + { return (data.is_log && data.dt == DataType::F16) && data.isa.fp16; }, + REGISTER_FP16_NEON(neon_fp16_softmax<true>)}, + {"neon_qu8_log_softmax", + [](const SoftmaxKernelDataTypeISASelectorData &data) { return (data.is_log && data.dt == DataType::QASYMM8); }, + REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_qasymm8_softmax<true>)}, + {"neon_qs8_log_softmax", + [](const SoftmaxKernelDataTypeISASelectorData &data) + { return (data.is_log && data.dt == DataType::QASYMM8_SIGNED); }, + REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_qasymm8_signed_softmax<true>)}, }; -Status validate_arguments_logits_1d_max(const ITensorInfo &input, const ITensorInfo &output) -{ - ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&input); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, - DataType::F16, DataType::F32); - - // Validate in case of configured output - if (output.total_size() != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input, &output); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&input, &output); - 
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output.tensor_shape(), - TensorShape(input.tensor_shape()).set(0, 1)); - } - - return Status{}; -} -} //namespace -const std::vector<CpuLogits1DMaxKernel::SoftmaxLogits1DMaxKernel> &CpuLogits1DMaxKernel::get_available_kernels() -{ - return available_kernels_max_logits; -} - -void CpuLogits1DMaxKernel::configure(const ITensorInfo *src, ITensorInfo *dst) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_logits_1d_max(*src, *dst)); - - // Softmax across the x dimension - const TensorShape output_shape = TensorShape(src->tensor_shape()).set(0, 1); - // Output auto initialization if not yet initialized - auto_init_if_empty(*dst, output_shape, 1, src->data_type(), src->quantization_info()); - - const auto *uk = get_implementation(DataTypeISASelectorData{src->data_type(), CPUInfo::get().get_isa()}); - ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); - - _run_method = uk->ukernel; - _name = std::string("CpuLogits1DMaxKernel").append("/").append(uk->name); - - Window win = calculate_max_window(*src, Steps()); - ICpuKernel::configure(win); -} - -Status CpuLogits1DMaxKernel::validate(const ITensorInfo *src, const ITensorInfo *dst) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_logits_1d_max(*src, *dst)); - - return Status{}; -} - -void CpuLogits1DMaxKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); - ARM_COMPUTE_ERROR_ON(_run_method == nullptr); - - const auto src = tensors.get_const_tensor(TensorType::ACL_SRC); - auto dst = tensors.get_tensor(TensorType::ACL_DST); - - _run_method(src, dst, window); -} - -const char *CpuLogits1DMaxKernel::name() const -{ - return _name.c_str(); -} - -/* Softmax Logits 1D - computation for QASYMM8 with 
pre-computed max. */ -template <bool IS_LOG> -static const std::vector<typename CpuLogits1DSoftmaxKernel<IS_LOG>::SoftmaxLogits1DKernel> available_kernels_logits = { - {"sve2_qu8_softmax_logits_1d", - [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8) && data.isa.sve2; }, - REGISTER_QASYMM8_SVE2(sve2_qasymm8_softmax)}, - {"sve2_qs8_softmax_logits_1d", - [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8_SIGNED) && data.isa.sve2; }, - REGISTER_QASYMM8_SIGNED_SVE2(sve2_qasymm8_signed_softmax)}, - {"sve_fp32_softmax_logits_1d", - [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F32) && data.isa.sve; }, - REGISTER_FP32_SVE(sve_fp32_softmax)}, - {"sve_fp16_softmax_logits_1d", - [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F16) && data.isa.sve && data.isa.fp16; }, - REGISTER_FP16_SVE(sve_fp16_softmax)}, - - {"neon_fp32_softmax_logits_1d", [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F32); }, - REGISTER_FP32_NEON(neon_fp32_softmax)}, - {"neon_fp16_softmax_logits_1d", - [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F16) && data.isa.fp16; }, - REGISTER_FP16_NEON(neon_fp16_softmax)}, - {"neon_qu8_softmax_logits_1d", [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8); }, - REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_qasymm8_softmax)}, - {"neon_qs8_softmax_logits_1d", - [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8_SIGNED); }, - REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_qasymm8_signed_softmax)}, -}; -namespace -{ -Status validate_arguments_logits_softmax(const ITensorInfo &src, - const ITensorInfo &max, - const ITensorInfo &dst, - const float beta, - const ITensorInfo &tmp, - bool is_log) +Status validate_arguments_softmax( + const ITensorInfo &src, const ITensorInfo &dst, float beta, const ITensorInfo &tmp, bool is_log) { ARM_COMPUTE_UNUSED(beta); // 
Check input @@ -184,11 +91,6 @@ Status validate_arguments_logits_softmax(const ITensorInfo &src, const bool is_quantized_asymmetric = is_data_type_quantized_asymmetric(src.data_type()); - // Check max - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src, &max); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(TensorShape(src.tensor_shape()).set(0, 1), max.tensor_shape()); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&src, &max); - // Check output if configured if (dst.total_size() != 0) { @@ -203,8 +105,11 @@ Status validate_arguments_logits_softmax(const ITensorInfo &src, // Check tmp if configured if (tmp.total_size() != 0) { - const DataType tmp_data_type = is_quantized_asymmetric ? DataType::F32 : src.data_type(); - ARM_COMPUTE_RETURN_ERROR_ON(tmp.data_type() != tmp_data_type); + // We have temporary storage only if src data type is quantized. + // Therefore, tmp data type must be F32 + ARM_COMPUTE_RETURN_ERROR_ON(tmp.data_type() != DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON(!is_quantized_asymmetric); + // We could potentially reduce tmp memory if we could predict or make an assumption // on the maximum number of threads that will run in parallel. 
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&src, &tmp); @@ -214,91 +119,97 @@ Status validate_arguments_logits_softmax(const ITensorInfo &src, } } // namespace -template <bool IS_LOG> -const std::vector<typename CpuLogits1DSoftmaxKernel<IS_LOG>::SoftmaxLogits1DKernel> & -CpuLogits1DSoftmaxKernel<IS_LOG>::get_available_kernels() +const std::vector<typename CpuSoftmaxKernel::SoftmaxKernel> &CpuSoftmaxKernel::get_available_kernels() { - return available_kernels_logits<IS_LOG>; + return available_kernels; } -template <bool IS_LOG> -void CpuLogits1DSoftmaxKernel<IS_LOG>::configure( - const ITensorInfo *src, const ITensorInfo *max, ITensorInfo *dst, const float beta, ITensorInfo *tmp) +void CpuSoftmaxKernel::configure(const ITensorInfo *src, ITensorInfo *dst, float beta, bool is_log, ITensorInfo *tmp) { - ARM_COMPUTE_ERROR_ON_NULLPTR(src, max, dst, tmp); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_logits_softmax(*src, *max, *dst, beta, *tmp, IS_LOG)); + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst, tmp); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_softmax(*src, *dst, beta, *tmp, is_log)); // Configure kernel window const bool is_quantized_asymmetric = is_data_type_quantized_asymmetric(src->data_type()); // Output auto initialization if not yet initialized const QuantizationInfo output_quantization = - is_quantized_asymmetric ? arm_compute::get_softmax_output_quantization_info(src->data_type(), IS_LOG) + is_quantized_asymmetric ? arm_compute::get_softmax_output_quantization_info(src->data_type(), is_log) : dst->quantization_info(); auto_init_if_empty(*dst, TensorInfo(*src).set_quantization_info(output_quantization).reset_padding()); - // Tmp auto initialization if not yet initialized - const DataType tmp_data_type = is_quantized_asymmetric ? 
DataType::F32 : src->data_type(); - auto_init_if_empty(*tmp, TensorInfo(*src).set_data_type(tmp_data_type).reset_padding()); + // Tmp auto initialization if not yet initialized and src is quantized + if (is_quantized_asymmetric) + { + const DataType tmp_data_type = is_quantized_asymmetric ? DataType::F32 : src->data_type(); + auto_init_if_empty(*tmp, TensorInfo(*src).set_data_type(tmp_data_type).reset_padding()); + } - const auto *uk = CpuLogits1DSoftmaxKernel<IS_LOG>::get_implementation( - DataTypeISASelectorData{src->data_type(), CPUInfo::get().get_isa()}); + const auto *uk = CpuSoftmaxKernel::get_implementation( + SoftmaxKernelDataTypeISASelectorData{src->data_type(), CPUInfo::get().get_isa(), is_log}); ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); - std::string kernel_name = - IS_LOG ? std::string("CpuLogits1DLogSoftmaxKernel") : std::string("CpuLogits1DSoftmaxKernel"); + std::string kernel_name = is_log ? std::string("CpuLogSoftmaxKernel") : std::string("CpuSoftmaxKernel"); _beta = beta; _run_method = uk->ukernel; _name = kernel_name.append("/").append(uk->name); - // Configure kernel window - Window win = calculate_max_window(*max, Steps()); + Window win = calculate_max_window(*dst, Steps()); + + /// TODO: Check dimensions > 0 for holes only. For this, we need + /// a utility function checking if there are holes after some dimension. 
+ if (!has_holes(*dst, dst->num_dimensions() - 1)) + { + win = win.collapse(win, Window::DimY); + } - ICpuKernel<CpuLogits1DSoftmaxKernel<IS_LOG>>::configure(win); + win.set(Window::DimX, Window::Dimension(0, 1, 1)); // First dimension is the reduction axis + + ICpuKernel<CpuSoftmaxKernel>::configure(win); } -template <bool IS_LOG> -Status CpuLogits1DSoftmaxKernel<IS_LOG>::validate( - const ITensorInfo *src, const ITensorInfo *max, const ITensorInfo *dst, const float beta, const ITensorInfo *tmp) +Status CpuSoftmaxKernel::validate( + const ITensorInfo *src, const ITensorInfo *dst, float beta, bool is_log, const ITensorInfo *tmp) { - ARM_COMPUTE_ERROR_ON_NULLPTR(src, max, dst, tmp); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_logits_softmax(*src, *max, *dst, beta, *tmp, IS_LOG)); + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst, tmp); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_softmax(*src, *dst, beta, *tmp, is_log)); return Status{}; } -template <bool IS_LOG> -void CpuLogits1DSoftmaxKernel<IS_LOG>::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) +void CpuSoftmaxKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) { - ARM_COMPUTE_UNUSED(info); ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel<CpuLogits1DSoftmaxKernel<IS_LOG>>::window(), window); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel<CpuSoftmaxKernel>::window(), window); ARM_COMPUTE_ERROR_ON(_run_method == nullptr); const auto src = tensors.get_const_tensor(TensorType::ACL_SRC_0); - auto max = tensors.get_tensor(TensorType::ACL_SRC_1); auto dst = tensors.get_tensor(TensorType::ACL_DST_0); - auto tmp = tensors.get_tensor(TensorType::ACL_DST_1); - const unsigned int num_elems_processed_per_iteration = src->info()->valid_region().shape.x(); - const unsigned int tmp_size_for_thread = tmp->info()->element_size() * num_elems_processed_per_iteration; + if 
(is_data_type_quantized_asymmetric(src->info()->data_type())) + { + auto tmp = tensors.get_tensor(TensorType::ACL_DST_1); + + const unsigned int num_elems_processed_per_iteration = src->info()->valid_region().shape.x(); + const unsigned int tmp_size_for_thread = tmp->info()->element_size() * num_elems_processed_per_iteration; - ARM_COMPUTE_ERROR_ON(tmp->info()->total_size() < (info.num_threads * tmp_size_for_thread)); + ARM_COMPUTE_ERROR_ON(tmp->info()->total_size() < (info.num_threads * tmp_size_for_thread)); - void *tmp_for_thread = tmp->buffer() + (info.thread_id * tmp_size_for_thread); - _run_method(src, max, tmp_for_thread, dst, _beta, IS_LOG, window); + void *tmp_for_thread = tmp->buffer() + (info.thread_id * tmp_size_for_thread); + _run_method(src, tmp_for_thread, dst, _beta, window); + } + else + { + _run_method(src, nullptr, dst, _beta, window); + } } -template <bool IS_LOG> -const char *CpuLogits1DSoftmaxKernel<IS_LOG>::name() const +const char *CpuSoftmaxKernel::name() const { return _name.c_str(); } -template class CpuLogits1DSoftmaxKernel<true>; -template class CpuLogits1DSoftmaxKernel<false>; - } // namespace kernels } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/CpuSoftmaxKernel.h b/src/cpu/kernels/CpuSoftmaxKernel.h index 5d288179fd..3db1f3d0ef 100644 --- a/src/cpu/kernels/CpuSoftmaxKernel.h +++ b/src/cpu/kernels/CpuSoftmaxKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2022 Arm Limited. + * Copyright (c) 2017-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#ifndef ARM_COMPUTE_CPU_SOFTMAX_KERNEL_H -#define ARM_COMPUTE_CPU_SOFTMAX_KERNEL_H +#ifndef ACL_SRC_CPU_KERNELS_CPUSOFTMAXKERNEL_H +#define ACL_SRC_CPU_KERNELS_CPUSOFTMAXKERNEL_H #include "src/core/common/Macros.h" #include "src/cpu/ICpuKernel.h" @@ -33,102 +33,55 @@ namespace cpu { namespace kernels { -/** Interface for the identifying the max value of 1D Logits */ -class CpuLogits1DMaxKernel : public ICpuKernel<CpuLogits1DMaxKernel> +/** Interface for softmax computation */ +class CpuSoftmaxKernel : public ICpuKernel<CpuSoftmaxKernel> { private: - using SoftmaxLogits1DMaxKernelPtr = std::add_pointer<void(const ITensor *, ITensor *, const Window &)>::type; + using SoftmaxKernelPtr = + std::add_pointer<void(const ITensor *, void *const, ITensor *, float, const Window &)>::type; public: - CpuLogits1DMaxKernel() = default; - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuLogits1DMaxKernel); - /** Set the input and output tensors. - * - * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. - * @param[out] dst Destination tensor info. Data types supported: same as @p input - */ - void configure(const ITensorInfo *src, ITensorInfo *dst); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to CpuLogits1DMaxKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; - const char *name() const override; - - struct SoftmaxLogits1DMaxKernel - { - const char *name; - const DataTypeISASelectorPtr is_selected; - SoftmaxLogits1DMaxKernelPtr ukernel; - }; - - static const std::vector<SoftmaxLogits1DMaxKernel> &get_available_kernels(); - -private: - SoftmaxLogits1DMaxKernelPtr _run_method{nullptr}; - std::string _name{}; -}; - -/** Interface for softmax computation for QASYMM8 with pre-computed max. 
*/ -template <bool IS_LOG = false> -class CpuLogits1DSoftmaxKernel : public ICpuKernel<CpuLogits1DSoftmaxKernel<IS_LOG>> -{ -private: - using SoftmaxLogits1DKernelPtr = std::add_pointer<void( - const ITensor *, const ITensor *, void *const, ITensor *, float, bool, const Window &)>::type; - -public: - CpuLogits1DSoftmaxKernel() = default; - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuLogits1DSoftmaxKernel); + CpuSoftmaxKernel() = default; + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuSoftmaxKernel); /** Set the input and output tensors. * - * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. - * @param[in] max Max values tensor info. Same shape as input with dimension 0 set to 1. - * Data types supported: same as @p input. - * @param[out] dst Destination tensor info. Data types supported: same as @p input. - * @param[in] beta A scaling factor for the exponent. + * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. + * @param[out] dst Destination tensor info. Data types supported: same as @p input. + * @param[in] beta A scaling factor for the exponent. + * @param[in] is_log True if the operation is log-softmax * * @param tmp Auxiliary tensor info. Must be type F32 and same shape as the input. 
*/ - void - configure(const ITensorInfo *src, const ITensorInfo *max, ITensorInfo *dst, const float beta, ITensorInfo *tmp); + void configure(const ITensorInfo *src, ITensorInfo *dst, float beta, bool is_log, ITensorInfo *tmp); /** Static function to check if given info will lead to a valid configuration * - * Similar to CpuLogits1DSoftmaxKernel::configure() + * Similar to CpuSoftmaxKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, - const ITensorInfo *max, - const ITensorInfo *dst, - const float beta, - const ITensorInfo *tmp); + static Status + validate(const ITensorInfo *src, const ITensorInfo *dst, float beta, bool is_log, const ITensorInfo *tmp); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; - struct SoftmaxLogits1DKernel + struct SoftmaxKernel { - const char *name; - const DataTypeISASelectorPtr is_selected; - SoftmaxLogits1DKernelPtr ukernel; + const char *name; + const SoftmaxKernelDataTypeISASelectorDataPtr is_selected; + SoftmaxKernelPtr ukernel; }; - static const std::vector<SoftmaxLogits1DKernel> &get_available_kernels(); + static const std::vector<SoftmaxKernel> &get_available_kernels(); private: - float _beta{1.0f}; - SoftmaxLogits1DKernelPtr _run_method{nullptr}; - std::string _name{}; + float _beta{1.0f}; + SoftmaxKernelPtr _run_method{nullptr}; + std::string _name{}; }; } // namespace kernels } // namespace cpu } // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_SOFTMAX_KERNEL_H */ +#endif // ACL_SRC_CPU_KERNELS_CPUSOFTMAXKERNEL_H diff --git a/src/cpu/kernels/softmax/generic/neon/fp16.cpp b/src/cpu/kernels/softmax/generic/neon/fp16.cpp index 2e2adf33e0..db8f881712 100644 --- a/src/cpu/kernels/softmax/generic/neon/fp16.cpp +++ b/src/cpu/kernels/softmax/generic/neon/fp16.cpp @@ -31,21 +31,18 @@ namespace arm_compute { namespace cpu { -void neon_fp16_softmax(const ITensor *in, - const ITensor 
*max, - void *const tmp, - ITensor *out, - const float beta, - bool is_log, - const Window &window) -{ - return neon_softmax_logits_1d_float<float16_t>(in, max, tmp, out, beta, is_log, window); -} -void neon_fp16_logits(const ITensor *in, ITensor *out, const Window &window) +template <bool IS_LOG> +void neon_fp16_softmax(const ITensor *in, void *const tmp, ITensor *out, const float beta, const Window &window) { - return neon_logits_1d_max<float16_t>(in, out, window); + return neon_softmax_float<float16_t, IS_LOG>(in, tmp, out, beta, window); } + +template void +neon_fp16_softmax<true>(const ITensor *in, void *const tmp, ITensor *out, const float beta, const Window &window); +template void +neon_fp16_softmax<false>(const ITensor *in, void *const tmp, ITensor *out, const float beta, const Window &window); + } // namespace cpu } // namespace arm_compute #endif //defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) diff --git a/src/cpu/kernels/softmax/generic/neon/fp32.cpp b/src/cpu/kernels/softmax/generic/neon/fp32.cpp index 61df40c1b5..c281d1bf31 100644 --- a/src/cpu/kernels/softmax/generic/neon/fp32.cpp +++ b/src/cpu/kernels/softmax/generic/neon/fp32.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022 Arm Limited. + * Copyright (c) 2021-2023 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -29,20 +29,17 @@ namespace arm_compute { namespace cpu { -void neon_fp32_softmax(const ITensor *in, - const ITensor *max, - void *const tmp, - ITensor *out, - const float beta, - bool is_log, - const Window &window) -{ - return neon_softmax_logits_1d_float<float>(in, max, tmp, out, beta, is_log, window); -} -void neon_fp32_logits(const ITensor *in, ITensor *out, const Window &window) +template <bool IS_LOG> +void neon_fp32_softmax(const ITensor *in, void *const tmp, ITensor *out, const float beta, const Window &window) { - return neon_logits_1d_max<float>(in, out, window); + return neon_softmax_float<float, IS_LOG>(in, tmp, out, beta, window); } + +template void +neon_fp32_softmax<true>(const ITensor *in, void *const tmp, ITensor *out, const float beta, const Window &window); +template void +neon_fp32_softmax<false>(const ITensor *in, void *const tmp, ITensor *out, const float beta, const Window &window); + } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/softmax/generic/neon/impl.cpp b/src/cpu/kernels/softmax/generic/neon/impl.cpp index 5d6e6a4f80..487f6ae051 100644 --- a/src/cpu/kernels/softmax/generic/neon/impl.cpp +++ b/src/cpu/kernels/softmax/generic/neon/impl.cpp @@ -29,43 +29,76 @@ namespace arm_compute { namespace cpu { -template void neon_logits_1d_max<qasymm8_signed_t>(const ITensor *in, ITensor *out, const Window &window); -template void neon_logits_1d_max<qasymm8_t>(const ITensor *in, ITensor *out, const Window &window); - -template <typename T> -void neon_softmax_logits_1d_quantized( - const ITensor *in, const ITensor *max, void *const tmp, ITensor *out, float beta, bool is_log, const Window &window) +template <typename T, bool IS_LOG> +void neon_softmax_quantized(const ITensor *in, void *const tmp, ITensor *out, float beta, const Window &window) { static_assert(std::is_same<T, qasymm8_t>::value || std::is_same<T, qasymm8_signed_t>::value, "quantized type should be either qasymm8_t or 
qasymm8_signed_t."); - const int start_x = in->info()->valid_region().anchor.x(); const int input_width = in->info()->valid_region().shape.x(); - const float scale_beta = -beta * in->info()->quantization_info().uniform().scale; - const auto scale_beta_vec = vdupq_n_f32(scale_beta); + const float scale_beta = -beta * in->info()->quantization_info().uniform().scale; + const float32x4_t scale_beta_vec = vdupq_n_f32(scale_beta); + + Iterator in_it(in, window); + Iterator out_it(out, window); - Iterator in_it(in, window); - Iterator max_it(max, window); - Iterator out_it(out, window); constexpr int vec_size = 16; +#ifndef __aarch64__ + const int sum_stages = log2(vec_size >> 1); +#endif // __aarch64__ + + using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>; + execute_window_loop( window, [&](const Coordinates &) { /* Get pointers */ - const auto in_ptr = reinterpret_cast<const T *>(in_it.ptr()) + start_x; - const auto out_ptr = reinterpret_cast<T *>(out_it.ptr()) + start_x; - const auto tmp_ptr = reinterpret_cast<float *>(tmp); + const T *in_ptr = reinterpret_cast<const T *>(in_it.ptr()); + T *out_ptr = reinterpret_cast<T *>(out_it.ptr()); + float *tmp_ptr = reinterpret_cast<float *>(tmp); + + T max_val; + + /* Compute Max */ + { + // Init max value + auto vec_max = wrapper::vdup_n(support::cpp11::lowest<T>(), ExactTagType{}); + int x = 0; - float sum{}; - float sum_inversed{}; + for (; x <= (input_width - vec_size); x += vec_size) + { + const auto current_value = wrapper::vloadq(in_ptr + x); + vec_max = wrapper::vmax(vec_max, current_value); + } + +#ifdef __aarch64__ + max_val = wrapper::vmaxv(vec_max); +#else // __aarch64__ + auto carry_max = wrapper::vpmax(wrapper::vgethigh(vec_max), wrapper::vgetlow(vec_max)); + + for (int i = 0; i < sum_stages; ++i) + { + carry_max = wrapper::vpmax(carry_max, carry_max); + } + + max_val = wrapper::vgetlane(carry_max, 0); +#endif // __aarch64__ + + // Compute left-over elements + 
for (; x < input_width; ++x) + { + max_val = std::max(*(in_ptr + x), max_val); + } + } // Compute Max + + float sum_transformed{}; /* Compute exponentials and sum */ { /* Get max value */ - const auto max_val = *reinterpret_cast<const T *>(max_it.ptr()); const auto vec_max = wrapper::vdup_n(max_val, wrapper::traits::vector_128_tag{}); /* Init sum to zero */ @@ -80,11 +113,11 @@ void neon_softmax_logits_1d_quantized( int x = 0; for (; x <= (input_width - vec_size); x += vec_size) { - auto vec_elements = wrapper::vloadq(in_ptr + x); - vec_elements = wrapper::vqsub(vec_max, vec_elements); - auto vec_elements_flt = convert_int_to_float<float32x4x4_t>(vec_elements); + auto vec_elements = wrapper::vloadq(in_ptr + x); + vec_elements = wrapper::vqsub(vec_max, vec_elements); + float32x4x4_t vec_elements_flt = convert_int_to_float<float32x4x4_t>(vec_elements); - if (is_log) + if (IS_LOG) { vec_elements_flt.val[0] = vmulq_f32(vec_elements_flt.val[0], scale_beta_vec); vec_elements_flt.val[1] = vmulq_f32(vec_elements_flt.val[1], scale_beta_vec); @@ -111,17 +144,24 @@ void neon_softmax_logits_1d_quantized( } /* Reduce sum */ - const auto sum_16_byte = + const float32x4_t sum_16_byte = vaddq_f32(vaddq_f32(vec_sum.val[0], vec_sum.val[1]), vaddq_f32(vec_sum.val[2], vec_sum.val[3])); + + float sum; + +#ifdef __aarch64__ + sum = wrapper::vaddv(sum_16_byte); +#else // __aarch64__ auto sum_res = vpadd_f32(vget_high_f32(sum_16_byte), vget_low_f32(sum_16_byte)); sum_res = vpadd_f32(sum_res, sum_res); sum = wrapper::vgetlane(sum_res, 0); +#endif // __aarch64__ /* Run remaining elements */ for (; x < input_width; ++x) { float element{}; - if (is_log) + if (IS_LOG) { element = (max_val - in_ptr[x]) * scale_beta; sum += std::exp(element); @@ -135,19 +175,22 @@ void neon_softmax_logits_1d_quantized( tmp_ptr[x] = element; } - if (!is_log) + if (!IS_LOG) { - sum_inversed = 256.f / sum; + sum_transformed = 256.f / sum; } else { - sum = std::log(sum); + sum_transformed = std::log(sum); } - } + } 
// Compute exponentials and sum /* Normalize exponentials */ { constexpr bool is_qasymm8_signed = std::is_same<T, qasymm8_signed_t>::value; + + const float32x4_t sum_vec = vdupq_n_f32(sum_transformed); + /* Loop over row and compute softmax */ int x = 0; for (; x <= (input_width - vec_size); x += vec_size) @@ -155,23 +198,23 @@ void neon_softmax_logits_1d_quantized( using int_vec_type = wrapper::traits::neon_vector_t<T, 16>; float32x4x4_t vec_in = vld4q_f32(tmp_ptr + x); int_vec_type normalized_value{}; - if (is_log) + if (IS_LOG) { const float32x4x4_t sub = { - vsubq_f32(vec_in.val[0], vdupq_n_f32(sum)), - vsubq_f32(vec_in.val[1], vdupq_n_f32(sum)), - vsubq_f32(vec_in.val[2], vdupq_n_f32(sum)), - vsubq_f32(vec_in.val[3], vdupq_n_f32(sum)), + vsubq_f32(vec_in.val[0], sum_vec), + vsubq_f32(vec_in.val[1], sum_vec), + vsubq_f32(vec_in.val[2], sum_vec), + vsubq_f32(vec_in.val[3], sum_vec), }; normalized_value = convert_float_to_int<float32x4x4_t, int_vec_type>(sub); } else { float32x4x4_t mul = { - vmulq_f32(vec_in.val[0], vdupq_n_f32(sum_inversed)), - vmulq_f32(vec_in.val[1], vdupq_n_f32(sum_inversed)), - vmulq_f32(vec_in.val[2], vdupq_n_f32(sum_inversed)), - vmulq_f32(vec_in.val[3], vdupq_n_f32(sum_inversed)), + vmulq_f32(vec_in.val[0], sum_vec), + vmulq_f32(vec_in.val[1], sum_vec), + vmulq_f32(vec_in.val[2], sum_vec), + vmulq_f32(vec_in.val[3], sum_vec), }; if (is_qasymm8_signed) @@ -190,34 +233,31 @@ void neon_softmax_logits_1d_quantized( /* Run remaining elements */ for (; x < input_width; ++x) { - if (is_log) + if (IS_LOG) { - out_ptr[x] = utils::cast::saturate_cast<T>(tmp_ptr[x] - sum); + out_ptr[x] = utils::cast::saturate_cast<T>(tmp_ptr[x] - sum_transformed); } else { - out_ptr[x] = utils::cast::saturate_cast<T>((tmp_ptr[x] * sum_inversed) - + out_ptr[x] = utils::cast::saturate_cast<T>((tmp_ptr[x] * sum_transformed) - (is_qasymm8_signed ? 
128.f : 0)); } } - } + } // Normalize exponentials }, - in_it, max_it, out_it); + in_it, out_it); } -template void neon_softmax_logits_1d_quantized<qasymm8_signed_t>(const ITensor *in, - const ITensor *max, - void *const tmp, - ITensor *out, - float beta, - bool is_log, - const Window &window); -template void neon_softmax_logits_1d_quantized<qasymm8_t>(const ITensor *in, - const ITensor *max, - void *const tmp, - ITensor *out, - float beta, - bool is_log, - const Window &window); +template void neon_softmax_quantized<qasymm8_signed_t, true>( + const ITensor *in, void *const tmp, ITensor *out, float beta, const Window &window); + +template void neon_softmax_quantized<qasymm8_signed_t, false>( + const ITensor *in, void *const tmp, ITensor *out, float beta, const Window &window); + +template void neon_softmax_quantized<qasymm8_t, true>( + const ITensor *in, void *const tmp, ITensor *out, float beta, const Window &window); + +template void neon_softmax_quantized<qasymm8_t, false>( + const ITensor *in, void *const tmp, ITensor *out, float beta, const Window &window); } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/softmax/generic/neon/impl.h b/src/cpu/kernels/softmax/generic/neon/impl.h index 4d9b789297..60380cd233 100644 --- a/src/cpu/kernels/softmax/generic/neon/impl.h +++ b/src/cpu/kernels/softmax/generic/neon/impl.h @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef SRC_CORE_NEON_KERNELS_SOFTMAX_IMPL_H -#define SRC_CORE_NEON_KERNELS_SOFTMAX_IMPL_H +#ifndef ACL_SRC_CPU_KERNELS_SOFTMAX_GENERIC_NEON_IMPL_H +#define ACL_SRC_CPU_KERNELS_SOFTMAX_GENERIC_NEON_IMPL_H #include "arm_compute/core/Helpers.h" @@ -33,105 +33,100 @@ namespace arm_compute { namespace cpu { -template <typename T> -void neon_logits_1d_max(const ITensor *in, ITensor *out, const Window &window) -{ - /** SIMD vector tag type. 
*/ - using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>; - - constexpr int window_step_x = 16 / sizeof(T); - const auto window_start_x = static_cast<int>(window.x().start()); - const auto window_end_x = static_cast<int>(window.x().end()); - - Window win{window}; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - Iterator input(in, win); - Iterator output(out, win); - - const int sum_stages = log2(window_step_x / 2); - execute_window_loop( - win, - [&](const Coordinates &) - { - // Get pointers - const auto in_ptr = reinterpret_cast<const T *>(input.ptr()); - const auto out_ptr = reinterpret_cast<T *>(output.ptr()); - - // Init max value - auto vec_max = wrapper::vdup_n(support::cpp11::lowest<T>(), ExactTagType{}); - int x = window_start_x; - - for (; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto current_value = wrapper::vloadq(in_ptr + x); - vec_max = wrapper::vmax(vec_max, current_value); - } - auto carry_max = wrapper::vpmax(wrapper::vgethigh(vec_max), wrapper::vgetlow(vec_max)); - - for (int i = 0; i < sum_stages; ++i) - { - carry_max = wrapper::vpmax(carry_max, carry_max); - } - T max_val = wrapper::vgetlane(carry_max, 0); - // Compute left-over elements - for (; x < window_end_x; ++x) - { - max_val = *(in_ptr + x) > max_val ? *(in_ptr + x) : max_val; - } +#ifdef __aarch64__ +namespace +{ +// These helper functions are added because vaddv does not exist for fp16, +// and, therefore, is not part of the wrapper::vaddv interface. 
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +inline float16_t wrapper_vaddv(const float16x8_t &a, int sum_stages) +{ + auto sum_res = wrapper::vpadd(wrapper::vgethigh(a), wrapper::vgetlow(a)); + for (int i = 0; i < sum_stages; ++i) + { + sum_res = wrapper::vpadd(sum_res, sum_res); + } + return wrapper::vgetlane(sum_res, 0); +} +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - *out_ptr = max_val; - }, - input, output); +inline float wrapper_vaddv(const float32x4_t &a, int sum_stages) +{ + ARM_COMPUTE_UNUSED(sum_stages); + return wrapper::vaddv(a); } +} // namespace +#endif // __aarch64__ -template <typename T> -void neon_softmax_logits_1d_quantized(const ITensor *in, - const ITensor *max, - void *const tmp, - ITensor *out, - float beta, - bool is_log, - const Window &window); - -template <typename T> -void neon_softmax_logits_1d_float(const ITensor *in, - const ITensor *max, - void *const tmp, - ITensor *out, - const float beta, - bool is_log, - const Window &window) +// The template implementation for float data types is stored in the header file because +// we need all fp16 instantiated code to live in fp16.cpp files. +template <typename T, bool IS_LOG> +void neon_softmax_float(const ITensor *in, void *const tmp, ITensor *out, float beta, const Window &window) { - const int start_x = in->info()->valid_region().anchor.x(); + ARM_COMPUTE_UNUSED(tmp); + const int input_width = in->info()->valid_region().shape.x(); Iterator in_it(in, window); - Iterator max_it(max, window); Iterator out_it(out, window); /** SIMD vector tag type. 
*/ using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>; - constexpr int vec_size = 16 / sizeof(T); - const int sum_stages = log2(vec_size / 2); + constexpr int vec_size = 16 / sizeof(T); + + const int sum_stages = log2(vec_size >> 1); + + const auto beta_vec = wrapper::vdup_n(static_cast<T>(beta), ExactTagType{}); execute_window_loop( window, [&](const Coordinates &) { /* Get pointers */ - const auto in_ptr = reinterpret_cast<const T *>(in_it.ptr()) + start_x; - const auto out_ptr = reinterpret_cast<T *>(out_it.ptr()) + start_x; - const auto tmp_ptr = reinterpret_cast<T *>(tmp); + const T *in_ptr = reinterpret_cast<const T *>(in_it.ptr()); + T *out_ptr = reinterpret_cast<T *>(out_it.ptr()); + + T max_val; + + /* Compute Max */ + { + // Init max value + auto vec_max = wrapper::vdup_n(support::cpp11::lowest<T>(), ExactTagType{}); + int x = 0; + + for (; x <= (input_width - vec_size); x += vec_size) + { + const auto current_value = wrapper::vloadq(in_ptr + x); + vec_max = wrapper::vmax(vec_max, current_value); + } + +#ifdef __aarch64__ + max_val = wrapper::vmaxv(vec_max); +#else // __aarch64__ + auto carry_max = wrapper::vpmax(wrapper::vgethigh(vec_max), wrapper::vgetlow(vec_max)); + + for (int i = 0; i < sum_stages; ++i) + { + carry_max = wrapper::vpmax(carry_max, carry_max); + } + + max_val = wrapper::vgetlane(carry_max, 0); +#endif // __aarch64__ - T sum{}; - T sum_inversed{}; + // Compute left-over elements + for (; x < input_width; ++x) + { + max_val = std::max(*(in_ptr + x), max_val); + } + } // compute max + + T sum_transformed{}; /* Compute exponentials and sum */ { /* Get max value */ - const auto max_val = *reinterpret_cast<const T *>(max_it.ptr()); const auto vec_max = wrapper::vdup_n(max_val, ExactTagType{}); /* Init sum to zero */ @@ -143,35 +138,38 @@ void neon_softmax_logits_1d_float(const ITensor *in, { auto vec_elements = wrapper::vloadq(in_ptr + x); vec_elements = wrapper::vsub(vec_elements, 
vec_max); - if (is_log) + if (IS_LOG) { - vec_elements = - wrapper::vmul(vec_elements, wrapper::vdup_n(static_cast<T>(beta), ExactTagType{})); - vec_sum = wrapper::vadd(vec_sum, wrapper::vexpq(vec_elements)); + vec_elements = wrapper::vmul(vec_elements, beta_vec); + vec_sum = wrapper::vadd(vec_sum, wrapper::vexpq(vec_elements)); } else { - vec_elements = wrapper::vexpq( - wrapper::vmul(vec_elements, wrapper::vdup_n(static_cast<T>(beta), ExactTagType{}))); - vec_sum = wrapper::vadd(vec_sum, vec_elements); + vec_elements = wrapper::vexpq(wrapper::vmul(vec_elements, beta_vec)); + vec_sum = wrapper::vadd(vec_sum, vec_elements); } - wrapper::vstore(tmp_ptr + x, vec_elements); + wrapper::vstore(out_ptr + x, vec_elements); } /* Reduce sum */ + T sum{}; +#ifdef __aarch64__ + sum = wrapper_vaddv(vec_sum, sum_stages); +#else // __aarch64__ auto sum_res = wrapper::vpadd(wrapper::vgethigh(vec_sum), wrapper::vgetlow(vec_sum)); for (int i = 0; i < sum_stages; ++i) { sum_res = wrapper::vpadd(sum_res, sum_res); } sum = wrapper::vgetlane(sum_res, 0); +#endif // __aarch64__ /* Run remaining elements */ for (; x < input_width; ++x) { T element{}; - if (is_log) + if (IS_LOG) { element = (in_ptr[x] - max_val) * beta; sum += std::exp(element); @@ -181,55 +179,59 @@ void neon_softmax_logits_1d_float(const ITensor *in, element = std::exp((in_ptr[x] - max_val) * beta); sum += element; } - tmp_ptr[x] = element; + + out_ptr[x] = element; } - if (!is_log) + if (!IS_LOG) { - sum_inversed = T(1) / sum; + sum_transformed = T(1) / sum; } else { - sum = static_cast<T>(std::log(sum)); + sum_transformed = static_cast<T>(std::log(sum)); } - } + } // Compute exponentials and sum /* Normalize exponentials */ { + const auto sum_vec = wrapper::vdup_n(static_cast<T>(sum_transformed), ExactTagType{}); + /* Loop over row and compute softmax */ int x = 0; for (; x <= (input_width - vec_size); x += vec_size) { - auto vec_in = wrapper::vloadq(tmp_ptr + x); - auto normalized_value = 
wrapper::vdup_n(static_cast<T>(0), ExactTagType{}); - if (is_log) + const auto vec_in = wrapper::vloadq(out_ptr + x); + if (IS_LOG) { - normalized_value = wrapper::vsub(vec_in, wrapper::vdup_n(static_cast<T>(sum), ExactTagType{})); + wrapper::vstore(out_ptr + x, wrapper::vsub(vec_in, sum_vec)); } else { - normalized_value = - wrapper::vmul(vec_in, wrapper::vdup_n(static_cast<T>(sum_inversed), ExactTagType{})); + wrapper::vstore(out_ptr + x, wrapper::vmul(vec_in, sum_vec)); } - wrapper::vstore(out_ptr + x, normalized_value); } + /* Run remaining elements */ for (; x < input_width; ++x) { - if (is_log) + if (IS_LOG) { - out_ptr[x] = tmp_ptr[x] - sum; + out_ptr[x] = out_ptr[x] - sum_transformed; } else { - out_ptr[x] = tmp_ptr[x] * sum_inversed; + out_ptr[x] = out_ptr[x] * sum_transformed; } } - } + } // Normalize exponentials }, - in_it, max_it, out_it); + in_it, out_it); } + +template <typename T, bool IS_LOG> +void neon_softmax_quantized(const ITensor *in, void *const tmp, ITensor *out, float beta, const Window &window); } // namespace cpu } // namespace arm_compute -#endif /* SRC_CORE_NEON_KERNELS_SOFTMAX_IMPL_H */ +#endif // ACL_SRC_CPU_KERNELS_SOFTMAX_GENERIC_NEON_IMPL_H diff --git a/src/cpu/kernels/softmax/generic/neon/qasymm8.cpp b/src/cpu/kernels/softmax/generic/neon/qasymm8.cpp index 40713dc496..9589ebcd7c 100644 --- a/src/cpu/kernels/softmax/generic/neon/qasymm8.cpp +++ b/src/cpu/kernels/softmax/generic/neon/qasymm8.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022 Arm Limited. + * Copyright (c) 2021-2023 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -29,20 +29,16 @@ namespace arm_compute { namespace cpu { -void neon_qasymm8_softmax(const ITensor *in, - const ITensor *max, - void *const tmp, - ITensor *out, - const float beta, - bool is_log, - const Window &window) +template <bool IS_LOG> +void neon_qasymm8_softmax(const ITensor *in, void *const tmp, ITensor *out, const float beta, const Window &window) { - return neon_softmax_logits_1d_quantized<qasymm8_t>(in, max, tmp, out, beta, is_log, window); + return neon_softmax_quantized<qasymm8_t, IS_LOG>(in, tmp, out, beta, window); } -void neon_qasymm8_logits(const ITensor *in, ITensor *out, const Window &window) -{ - return neon_logits_1d_max<qasymm8_t>(in, out, window); -} +template void +neon_qasymm8_softmax<true>(const ITensor *in, void *const tmp, ITensor *out, const float beta, const Window &window); +template void +neon_qasymm8_softmax<false>(const ITensor *in, void *const tmp, ITensor *out, const float beta, const Window &window); + } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/softmax/generic/neon/qasymm8_signed.cpp b/src/cpu/kernels/softmax/generic/neon/qasymm8_signed.cpp index 2c5e284f54..0bf6b2859a 100644 --- a/src/cpu/kernels/softmax/generic/neon/qasymm8_signed.cpp +++ b/src/cpu/kernels/softmax/generic/neon/qasymm8_signed.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022 Arm Limited. + * Copyright (c) 2021-2023 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -29,20 +29,17 @@ namespace arm_compute { namespace cpu { -void neon_qasymm8_signed_softmax(const ITensor *in, - const ITensor *max, - void *const tmp, - ITensor *out, - const float beta, - bool is_log, - const Window &window) +template <bool IS_LOG> +void neon_qasymm8_signed_softmax( + const ITensor *in, void *const tmp, ITensor *out, const float beta, const Window &window) { - return neon_softmax_logits_1d_quantized<qasymm8_signed_t>(in, max, tmp, out, beta, is_log, window); + return neon_softmax_quantized<qasymm8_signed_t, IS_LOG>(in, tmp, out, beta, window); } -void neon_qasymm8_singed_logits(const ITensor *in, ITensor *out, const Window &window) -{ - return neon_logits_1d_max<qasymm8_signed_t>(in, out, window); -} +template void neon_qasymm8_signed_softmax<true>( + const ITensor *in, void *const tmp, ITensor *out, const float beta, const Window &window); +template void neon_qasymm8_signed_softmax<false>( + const ITensor *in, void *const tmp, ITensor *out, const float beta, const Window &window); + } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/softmax/generic/sve/fp16.cpp b/src/cpu/kernels/softmax/generic/sve/fp16.cpp deleted file mode 100644 index 5e94f72faf..0000000000 --- a/src/cpu/kernels/softmax/generic/sve/fp16.cpp +++ /dev/null @@ -1,50 +0,0 @@ -/* - * Copyright (c) 2021-2023 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) -#include "arm_compute/core/Helpers.h" - -#include "src/cpu/CpuTypes.h" -#include "src/cpu/kernels/softmax/generic/sve/impl.h" -namespace arm_compute -{ -namespace cpu -{ -void sve_fp16_softmax(const ITensor *in, - const ITensor *max, - void *const tmp, - ITensor *out, - const float beta, - bool is_log, - const Window &window) -{ - return sve_softmax_logits_1d_float<float16_t>(in, max, tmp, out, beta, is_log, window); -} - -void sve_fp16_logits(const ITensor *in, ITensor *out, const Window &window) -{ - return sve_logits_1d_max<float16_t>(in, out, window); -} -} // namespace cpu -} // namespace arm_compute -#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ diff --git a/src/cpu/kernels/softmax/generic/sve/fp32.cpp b/src/cpu/kernels/softmax/generic/sve/fp32.cpp deleted file mode 100644 index d692cc2477..0000000000 --- a/src/cpu/kernels/softmax/generic/sve/fp32.cpp +++ /dev/null @@ -1,49 +0,0 @@ -/* - * Copyright (c) 2021-2022 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "arm_compute/core/Helpers.h" - -#include "src/cpu/kernels/softmax/generic/sve/impl.h" - -namespace arm_compute -{ -namespace cpu -{ -void sve_fp32_softmax(const ITensor *in, - const ITensor *max, - void *const tmp, - ITensor *out, - const float beta, - bool is_log, - const Window &window) -{ - return sve_softmax_logits_1d_float<float>(in, max, tmp, out, beta, is_log, window); -} - -void sve_fp32_logits(const ITensor *in, ITensor *out, const Window &window) -{ - return sve_logits_1d_max<float>(in, out, window); -} -} // namespace cpu -} // namespace arm_compute diff --git a/src/cpu/kernels/softmax/generic/sve/impl.cpp b/src/cpu/kernels/softmax/generic/sve/impl.cpp index 24f1bb8143..0d4b7f4509 100644 --- a/src/cpu/kernels/softmax/generic/sve/impl.cpp +++ b/src/cpu/kernels/softmax/generic/sve/impl.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022 Arm Limited. + * Copyright (c) 2021-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -30,6 +30,9 @@ namespace arm_compute { namespace cpu { +/// TODO: (COMPMID-6505) Similar to Neon(TM), this implementation can be converted to +/// a single kernel that performs softmax operation. Leaving the SVE code here for +/// future reference.
Implementation for Neon(TM) is introduced in COMPMID-6500 template <typename ScalarType> void sve_logits_1d_max(const ITensor *in, ITensor *out, const Window &window) { @@ -172,25 +175,5 @@ void sve_softmax_logits_1d_float(const ITensor *in, }, in_it, max_it, out_it); } - -template void sve_logits_1d_max<float>(const ITensor *in, ITensor *out, const Window &window); -template void sve_logits_1d_max<float16_t>(const ITensor *in, ITensor *out, const Window &window); -template void sve_logits_1d_max<qasymm8_t>(const ITensor *in, ITensor *out, const Window &window); -template void sve_logits_1d_max<qasymm8_signed_t>(const ITensor *in, ITensor *out, const Window &window); - -template void sve_softmax_logits_1d_float<float>(const ITensor *in, - const ITensor *max, - void *const tmp, - ITensor *out, - const float beta, - bool is_log, - const Window &window); -template void sve_softmax_logits_1d_float<float16_t>(const ITensor *in, - const ITensor *max, - void *const tmp, - ITensor *out, - const float beta, - bool is_log, - const Window &window); } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/softmax/generic/sve/qasymm8.cpp b/src/cpu/kernels/softmax/generic/sve/qasymm8.cpp deleted file mode 100644 index 85e5ccfea1..0000000000 --- a/src/cpu/kernels/softmax/generic/sve/qasymm8.cpp +++ /dev/null @@ -1,38 +0,0 @@ -/* - * Copyright (c) 2021-2022 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "arm_compute/core/Helpers.h" - -#include "src/cpu/kernels/softmax/generic/sve/impl.h" - -namespace arm_compute -{ -namespace cpu -{ -void sve_qasymm8_logits(const ITensor *in, ITensor *out, const Window &window) -{ - return sve_logits_1d_max<qasymm8_t>(in, out, window); -} -} // namespace cpu -} // namespace arm_compute diff --git a/src/cpu/kernels/softmax/generic/sve/qasymm8_signed.cpp b/src/cpu/kernels/softmax/generic/sve/qasymm8_signed.cpp deleted file mode 100644 index 4be2e2eed6..0000000000 --- a/src/cpu/kernels/softmax/generic/sve/qasymm8_signed.cpp +++ /dev/null @@ -1,38 +0,0 @@ -/* - * Copyright (c) 2021-2022 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "arm_compute/core/Helpers.h" - -#include "src/cpu/kernels/softmax/generic/sve/impl.h" - -namespace arm_compute -{ -namespace cpu -{ -void sve_qasymm8_signed_logits(const ITensor *in, ITensor *out, const Window &window) -{ - return sve_logits_1d_max<qasymm8_signed_t>(in, out, window); -} -} // namespace cpu -} // namespace arm_compute diff --git a/src/cpu/kernels/softmax/generic/sve2/impl.cpp b/src/cpu/kernels/softmax/generic/sve2/impl.cpp index 98b2f5117f..a8fb1d4adf 100644 --- a/src/cpu/kernels/softmax/generic/sve2/impl.cpp +++ b/src/cpu/kernels/softmax/generic/sve2/impl.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022 Arm Limited. + * Copyright (c) 2021-2023 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -32,6 +32,9 @@ namespace arm_compute { namespace cpu { +/// TODO: (COMPMID-6505) Similar to Neon(TM), this implementation can be converted to +/// a single kernel that performs softmax operation. Leaving the SVE2 code here for +/// future reference. Implementation for Neon(TM) is introduced in COMPMID-6500 template <typename ScalarType> void sve2_softmax_logits_1d_quantized( const ITensor *in, const ITensor *max, void *const tmp, ITensor *out, float beta, bool is_log, const Window &window) @@ -205,20 +208,5 @@ void sve2_softmax_logits_1d_quantized( }, in_it, max_it, out_it); } - -template void sve2_softmax_logits_1d_quantized<qasymm8_signed_t>(const ITensor *in, - const ITensor *max, - void *const tmp, - ITensor *out, - float beta, - bool is_log, - const Window &window); -template void sve2_softmax_logits_1d_quantized<qasymm8_t>(const ITensor *in, - const ITensor *max, - void *const tmp, - ITensor *out, - float beta, - bool is_log, - const Window &window); } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/softmax/generic/sve2/qasymm8.cpp b/src/cpu/kernels/softmax/generic/sve2/qasymm8.cpp deleted file mode 100644 index 95623786b3..0000000000 --- a/src/cpu/kernels/softmax/generic/sve2/qasymm8.cpp +++ /dev/null @@ -1,44 +0,0 @@ -/* - * Copyright (c) 2021-2022 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software.
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "arm_compute/core/Helpers.h" - -#include "src/cpu/kernels/softmax/generic/sve2/impl.h" - -namespace arm_compute -{ -namespace cpu -{ -void sve2_qasymm8_softmax(const ITensor *in, - const ITensor *max, - void *const tmp, - ITensor *out, - const float beta, - bool is_log, - const Window &window) -{ - return sve2_softmax_logits_1d_quantized<qasymm8_t>(in, max, tmp, out, beta, is_log, window); -} -} // namespace cpu -} // namespace arm_compute diff --git a/src/cpu/kernels/softmax/generic/sve2/qasymm8_signed.cpp b/src/cpu/kernels/softmax/generic/sve2/qasymm8_signed.cpp deleted file mode 100644 index c20462fcef..0000000000 --- a/src/cpu/kernels/softmax/generic/sve2/qasymm8_signed.cpp +++ /dev/null @@ -1,44 +0,0 @@ -/* - * Copyright (c) 2021-2022 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "arm_compute/core/Helpers.h" - -#include "src/cpu/kernels/softmax/generic/sve2/impl.h" - -namespace arm_compute -{ -namespace cpu -{ -void sve2_qasymm8_signed_softmax(const ITensor *in, - const ITensor *max, - void *const tmp, - ITensor *out, - const float beta, - bool is_log, - const Window &window) -{ - return sve2_softmax_logits_1d_quantized<qasymm8_signed_t>(in, max, tmp, out, beta, is_log, window); -} -} // namespace cpu -} // namespace arm_compute diff --git a/src/cpu/kernels/softmax/list.h b/src/cpu/kernels/softmax/list.h index 627ce0c264..c143f6659d 100644 --- a/src/cpu/kernels/softmax/list.h +++ b/src/cpu/kernels/softmax/list.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022 Arm Limited. + * Copyright (c) 2021-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,41 +21,24 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#ifndef SRC_CORE_NEON_KERNELS_SOFTMAX_LIST_H -#define SRC_CORE_NEON_KERNELS_SOFTMAX_LIST_H +#ifndef ACL_SRC_CPU_KERNELS_SOFTMAX_LIST_H +#define ACL_SRC_CPU_KERNELS_SOFTMAX_LIST_H namespace arm_compute { namespace cpu { -#define DECLARE_SOFTMAX_KERNEL(func_name) \ - void func_name(const ITensor *in, const ITensor *max, void *const tmp, ITensor *out, const float beta, \ - bool is_log, const Window &window) +#define DECLARE_SOFTMAX_KERNEL(func_name) \ + template <bool IS_LOG> \ + void func_name(const ITensor *in, void *const tmp, ITensor *out, const float beta, const Window &window) DECLARE_SOFTMAX_KERNEL(neon_fp32_softmax); DECLARE_SOFTMAX_KERNEL(neon_fp16_softmax); DECLARE_SOFTMAX_KERNEL(neon_qasymm8_softmax); DECLARE_SOFTMAX_KERNEL(neon_qasymm8_signed_softmax); -DECLARE_SOFTMAX_KERNEL(sve_fp32_softmax); -DECLARE_SOFTMAX_KERNEL(sve_fp16_softmax); -DECLARE_SOFTMAX_KERNEL(sve2_qasymm8_signed_softmax); -DECLARE_SOFTMAX_KERNEL(sve2_qasymm8_softmax); #undef DECLARE_SOFTMAX_KERNEL - -#define DECLARE_LOGITS_KERNEL(func_name) void func_name(const ITensor *in, ITensor *out, const Window &window) - -DECLARE_LOGITS_KERNEL(neon_fp32_logits); -DECLARE_LOGITS_KERNEL(neon_fp16_logits); -DECLARE_LOGITS_KERNEL(neon_qasymm8_logits); -DECLARE_LOGITS_KERNEL(neon_qasymm8_singed_logits); -DECLARE_LOGITS_KERNEL(sve_fp32_logits); -DECLARE_LOGITS_KERNEL(sve_fp16_logits); -DECLARE_LOGITS_KERNEL(sve_qasymm8_logits); -DECLARE_LOGITS_KERNEL(sve_qasymm8_signed_logits); - -#undef DECLARE_LOGITS_KERNEL } // namespace cpu } // namespace arm_compute -#endif /* SRC_CORE_NEON_KERNELS_SOFTMAX_LIST_H */ +#endif // ACL_SRC_CPU_KERNELS_SOFTMAX_LIST_H diff --git a/src/cpu/operators/CpuSoftmax.cpp b/src/cpu/operators/CpuSoftmax.cpp index e55d7f903e..ae14381ad9 100644 --- a/src/cpu/operators/CpuSoftmax.cpp +++ b/src/cpu/operators/CpuSoftmax.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021, 2023 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -41,13 +41,10 @@ namespace arm_compute { namespace cpu { -template <bool IS_LOG> -CpuSoftmaxGeneric<IS_LOG>::CpuSoftmaxGeneric() +CpuSoftmaxGeneric::CpuSoftmaxGeneric() : _permute_input(), _permute_output(), - _max_kernel(), _softmax_kernel(), - _max(), _tmp(), _input_permuted(), _output_permuted(), @@ -56,8 +53,7 @@ CpuSoftmaxGeneric<IS_LOG>::CpuSoftmaxGeneric() { } -template <bool IS_LOG> -void CpuSoftmaxGeneric<IS_LOG>::configure(const ITensorInfo *src, ITensorInfo *dst, float beta, int32_t axis) +void CpuSoftmaxGeneric::configure(const ITensorInfo *src, ITensorInfo *dst, float beta, int32_t axis, bool is_log) { // Perform validation step ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); @@ -79,29 +75,23 @@ void CpuSoftmaxGeneric<IS_LOG>::configure(const ITensorInfo *src, ITensorInfo *d // or it is the original input case (2D case) const ITensorInfo *tmp_input = (_needs_permute ? &_input_permuted : src); - // Create intermediate tensors shapes - TensorShape max_sum_shape = tmp_input->tensor_shape(); - max_sum_shape.set(0, 1); - const TensorInfo input_info = tmp_input->clone()->reset_padding().set_is_resizable(true); - DataType tmp_data_type = - is_data_type_quantized_asymmetric(tmp_input->data_type()) ? 
DataType::F32 : tmp_input->data_type(); - TensorInfo tensor_info_tmp(input_info.clone()->set_data_type(tmp_data_type)); - TensorInfo max_info(tmp_input->clone()->set_tensor_shape(max_sum_shape)); + TensorInfo tensor_info_tmp; + if (is_data_type_quantized_asymmetric(src->data_type())) + { + // Create intermediate tensors shapes + const TensorInfo input_info = tmp_input->clone()->reset_padding().set_is_resizable(true); + tensor_info_tmp = input_info.clone()->set_data_type(DataType::F32); + } // Init intermediate tensors - _max = TensorInfo(max_info); _tmp = TensorInfo(tensor_info_tmp); // Configure kernels - auto mk = std::make_unique<kernels::CpuLogits1DMaxKernel>(); - mk->configure(tmp_input, &_max); - _max_kernel = std::move(mk); - - auto sm = std::make_unique<kernels::CpuLogits1DSoftmaxKernel<IS_LOG>>(); + auto sm = std::make_unique<kernels::CpuSoftmaxKernel>(); if (_needs_permute) { // The normalization kernel stores the result in a permuted output tensor - sm->configure(tmp_input, &_max, &_output_permuted, beta, &_tmp); + sm->configure(tmp_input, &_output_permuted, beta, is_log, &_tmp); // Re-permute the permuted output into the requested (4D) output _permute_output.configure(&_output_permuted, dst, @@ -110,14 +100,15 @@ void CpuSoftmaxGeneric<IS_LOG>::configure(const ITensorInfo *src, ITensorInfo *d else { // Softmax 2D case - sm->configure(tmp_input, &_max, dst, beta, &_tmp); + sm->configure(tmp_input, dst, beta, is_log, &_tmp); } _softmax_kernel = std::move(sm); - _aux_mem[InternalTensorIdx::MAX] = - MemoryInfo(offset_int_vec(InternalTensorIdx::MAX), MemoryLifetime::Temporary, _max.total_size()); - _aux_mem[InternalTensorIdx::TMP] = - MemoryInfo(offset_int_vec(InternalTensorIdx::TMP), MemoryLifetime::Temporary, _tmp.total_size()); + if (_tmp.total_size() > 0) + { + _aux_mem[InternalTensorIdx::TMP] = + MemoryInfo(offset_int_vec(InternalTensorIdx::TMP), MemoryLifetime::Temporary, _tmp.total_size()); + } _aux_mem[InternalTensorIdx::PERMUTED_SRC] = 
MemoryInfo(offset_int_vec(InternalTensorIdx::PERMUTED_SRC), MemoryLifetime::Temporary, _input_permuted.total_size()); @@ -125,8 +116,8 @@ void CpuSoftmaxGeneric<IS_LOG>::configure(const ITensorInfo *src, ITensorInfo *d MemoryLifetime::Temporary, _output_permuted.total_size()); } -template <bool IS_LOG> -Status CpuSoftmaxGeneric<IS_LOG>::validate(const ITensorInfo *src, const ITensorInfo *dst, float beta, int32_t axis) +Status +CpuSoftmaxGeneric::validate(const ITensorInfo *src, const ITensorInfo *dst, float beta, int32_t axis, bool is_log) { // Perform validation step ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); @@ -136,17 +127,12 @@ Status CpuSoftmaxGeneric<IS_LOG>::validate(const ITensorInfo *src, const ITensor static_cast<int32_t>(src->num_dimensions()) <= axis); // Create intermediate tensor info - DataType tmp_data_type = src->data_type(); - const TensorInfo tensor_info_tmp(src->clone()->set_data_type(tmp_data_type).set_is_resizable(true)); - - TensorShape max_sum_shape = src->tensor_shape(); - max_sum_shape.set(0, 1); - const TensorInfo tensor_info_max_sum(src->clone() - ->set_tensor_shape(max_sum_shape) - .set_data_type(tmp_data_type) - .set_quantization_info(src->quantization_info()) - .set_is_resizable(true)); - const TensorInfo dont_care; + TensorInfo tensor_info_tmp; + + if (is_data_type_quantized_asymmetric(src->data_type())) + { + tensor_info_tmp = src->clone()->set_data_type(DataType::F32).set_is_resizable(true); + } const unsigned int actual_axis = static_cast<unsigned int>(wrap_around(axis, static_cast<int32_t>(src->num_dimensions()))); @@ -165,15 +151,12 @@ Status CpuSoftmaxGeneric<IS_LOG>::validate(const ITensorInfo *src, const ITensor ARM_COMPUTE_RETURN_ON_ERROR(CpuPermute::validate(&output_permuted, dst, permutation_vector)); } - ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuLogits1DMaxKernel::validate(src, &tensor_info_max_sum)); - ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuLogits1DSoftmaxKernel<IS_LOG>::validate( - &tensor_info_tmp, 
&tensor_info_max_sum, dst, beta, &dont_care)); + ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuSoftmaxKernel::validate(src, dst, beta, is_log, &tensor_info_tmp)); return Status{}; } -template <bool IS_LOG> -void CpuSoftmaxGeneric<IS_LOG>::run(ITensorPack &tensors) +void CpuSoftmaxGeneric::run(ITensorPack &tensors) { ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided"); @@ -181,13 +164,11 @@ void CpuSoftmaxGeneric<IS_LOG>::run(ITensorPack &tensors) auto dst = tensors.get_tensor(TensorType::ACL_DST); CpuAuxTensorHandler tmp(offset_int_vec(InternalTensorIdx::TMP), _tmp, tensors, true); - CpuAuxTensorHandler max(offset_int_vec(InternalTensorIdx::MAX), _max, tensors, true); CpuAuxTensorHandler input_permuted(offset_int_vec(InternalTensorIdx::PERMUTED_SRC), _input_permuted, tensors, true); CpuAuxTensorHandler output_permuted(offset_int_vec(InternalTensorIdx::PERMUTED_DST), _output_permuted, tensors, true); - ITensorPack max_pack; ITensorPack softmax_pack; if (_needs_permute) @@ -195,24 +176,15 @@ void CpuSoftmaxGeneric<IS_LOG>::run(ITensorPack &tensors) ITensorPack permute_in_pack = {{TensorType::ACL_SRC, src}, {TensorType::ACL_DST, input_permuted.get()}}; _permute_input.run(permute_in_pack); - max_pack = {{TensorType::ACL_SRC, input_permuted.get()}, {TensorType::ACL_DST, max.get()}}; - softmax_pack = {{TensorType::ACL_SRC_0, input_permuted.get()}, - {TensorType::ACL_SRC_1, max.get()}, {TensorType::ACL_DST_0, output_permuted.get()}, {TensorType::ACL_DST_1, tmp.get()}}; } else { - max_pack = {{TensorType::ACL_SRC, src}, {TensorType::ACL_DST, max.get()}}; - - softmax_pack = {{TensorType::ACL_SRC_0, src}, - {TensorType::ACL_SRC_1, max.get()}, - {TensorType::ACL_DST_0, dst}, - {TensorType::ACL_DST_1, tmp.get()}}; + softmax_pack = {{TensorType::ACL_SRC_0, src}, {TensorType::ACL_DST_0, dst}, {TensorType::ACL_DST_1, tmp.get()}}; } - NEScheduler::get().schedule_op(_max_kernel.get(), Window::DimY, _max_kernel->window(), max_pack); 
NEScheduler::get().schedule_op(_softmax_kernel.get(), Window::DimY, _softmax_kernel->window(), softmax_pack); if (_needs_permute) @@ -224,13 +196,10 @@ void CpuSoftmaxGeneric<IS_LOG>::run(ITensorPack &tensors) } } -template <bool IS_LOG> -experimental::MemoryRequirements CpuSoftmaxGeneric<IS_LOG>::workspace() const +experimental::MemoryRequirements CpuSoftmaxGeneric::workspace() const { return _aux_mem; } -template class CpuSoftmaxGeneric<false>; -template class CpuSoftmaxGeneric<true>; } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/operators/CpuSoftmax.h b/src/cpu/operators/CpuSoftmax.h index 8cab70e14f..47020e9b7c 100644 --- a/src/cpu/operators/CpuSoftmax.h +++ b/src/cpu/operators/CpuSoftmax.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022 Arm Limited. + * Copyright (c) 2021-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_CPU_SOFTMAX_H -#define ARM_COMPUTE_CPU_SOFTMAX_H +#ifndef ACL_SRC_CPU_OPERATORS_CPUSOFTMAX_H +#define ACL_SRC_CPU_OPERATORS_CPUSOFTMAX_H #include "arm_compute/core/experimental/Types.h" #include "arm_compute/core/TensorInfo.h" @@ -37,9 +37,7 @@ namespace arm_compute { namespace cpu { -class CpuLogits1DMaxKernel; -template <bool IS_LOG> -class CpuLogits1DSoftmaxKernel; +class CpuSoftmaxKernel; /** Basic function to compute a SoftmaxLayer and a Log SoftmaxLayer. * @@ -52,31 +50,31 @@ class CpuLogits1DSoftmaxKernel; * This function runs the following function/kernels: * -# If axis is not 0: * -# @ref CpuPermute - * -# @ref kernels::CpuLogits1DMaxKernel - * -# @ref kernels::CpuLogits1DSoftmaxKernel + * -# @ref kernels::CpuSoftmaxKernel */ -template <bool IS_LOG = false> class CpuSoftmaxGeneric : public ICpuOperator { public: CpuSoftmaxGeneric(); /** Set the input and output tensors. * - * @param[in,out] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. 
- * last value of each row to the nearest multiple. - * @param[out] dst Destination tensor ifo. Data types supported: same as @p input. - * @param[in] beta (Optional) A scaling factor for the exponent. - * @param[in] axis (Optional) The dimension in which to apply the function. E.g. for input of shape 4x5x6 and + * @param[in,out] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. + * last value of each row to the nearest multiple. + * @param[out] dst Destination tensor ifo. Data types supported: same as @p input. + * @param[in] beta (Optional) A scaling factor for the exponent. + * @param[in] axis (Optional) The dimension in which to apply the function. E.g. for input of shape 4x5x6 and * axis=1, softmax will be applied to 4x6=24 vectors of size 5. Defaults to 0 + * @param[in] is_log True if the operation is log-softmax */ - void configure(const ITensorInfo *src, ITensorInfo *dst, float beta = 1.0f, int32_t axis = 0); + void configure(const ITensorInfo *src, ITensorInfo *dst, float beta = 1.0f, int32_t axis = 0, bool is_log = false); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref CpuSoftmaxGeneric::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst, float beta = 1.0f, int32_t axis = 0); + static Status + validate(const ITensorInfo *src, const ITensorInfo *dst, float beta = 1.0f, int32_t axis = 0, bool is_log = false); // Inherited methods overridden: void run(ITensorPack &tensors) override; @@ -85,8 +83,7 @@ public: private: enum InternalTensorIdx { - MAX = 0, - TMP, + TMP = 0, PERMUTED_SRC, PERMUTED_DST, COUNT @@ -94,10 +91,8 @@ private: CpuPermute _permute_input; CpuPermute _permute_output; - std::unique_ptr<ICPPKernel> _max_kernel; std::unique_ptr<ICPPKernel> _softmax_kernel; - TensorInfo _max; TensorInfo _tmp; TensorInfo _input_permuted; TensorInfo _output_permuted; @@ -105,9 +100,7 @@ private: bool _needs_permute; 
experimental::MemoryRequirements _aux_mem{}; }; -using CpuSoftmax = CpuSoftmaxGeneric<false>; -using CpuLogSoftmax = CpuSoftmaxGeneric<true>; } // namespace cpu } // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_SOFTMAX_H */ +#endif // ACL_SRC_CPU_OPERATORS_CPUSOFTMAX_H diff --git a/src/runtime/NEON/functions/NESoftmaxLayer.cpp b/src/runtime/NEON/functions/NESoftmaxLayer.cpp index e3c2012d05..be588c5b52 100644 --- a/src/runtime/NEON/functions/NESoftmaxLayer.cpp +++ b/src/runtime/NEON/functions/NESoftmaxLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2021 Arm Limited. + * Copyright (c) 2017-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -29,7 +29,6 @@ #include "src/core/helpers/MemoryHelpers.h" #include "src/core/helpers/SoftmaxHelpers.h" -#include "src/cpu/kernels/CpuSoftmaxKernel.h" #include "src/cpu/operators/CpuSoftmax.h" namespace arm_compute @@ -37,13 +36,12 @@ namespace arm_compute template <bool IS_LOG> struct NESoftmaxLayerGeneric<IS_LOG>::Impl { - const ITensor *src{nullptr}; - ITensor *dst{nullptr}; - Tensor max{nullptr}; - std::unique_ptr<cpu::CpuSoftmaxGeneric<IS_LOG>> op{nullptr}; - MemoryGroup memory_group{}; - ITensorPack run_pack{}; - WorkspaceData<Tensor> workspace_tensors{}; + const ITensor *src{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr<cpu::CpuSoftmaxGeneric> op{nullptr}; + MemoryGroup memory_group{}; + ITensorPack run_pack{}; + WorkspaceData<Tensor> workspace_tensors{}; }; template <bool IS_LOG> @@ -67,8 +65,8 @@ void NESoftmaxLayerGeneric<IS_LOG>::configure(ITensor *input, ITensor *output, f _impl->src = input; _impl->dst = output; - _impl->op = std::make_unique<cpu::CpuSoftmaxGeneric<IS_LOG>>(); - _impl->op->configure(input->info(), output->info(), beta, axis); + _impl->op = std::make_unique<cpu::CpuSoftmaxGeneric>(); + _impl->op->configure(input->info(), output->info(), beta, axis, IS_LOG); _impl->run_pack = {{TensorType::ACL_SRC, _impl->src}, {TensorType::ACL_DST, _impl->dst}}; _impl->workspace_tensors = 
manage_workspace<Tensor>(_impl->op->workspace(), _impl->memory_group, _impl->run_pack); @@ -79,7 +77,7 @@ Status NESoftmaxLayerGeneric<IS_LOG>::validate(const ITensorInfo *input, const ITensorInfo *output, float beta, int32_t axis) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_RETURN_ON_ERROR(cpu::CpuSoftmaxGeneric<IS_LOG>::validate(input, output, beta, axis)); + ARM_COMPUTE_RETURN_ON_ERROR(cpu::CpuSoftmaxGeneric::validate(input, output, beta, axis, IS_LOG)); return Status{}; } diff --git a/tests/validation/NEON/SoftmaxLayer.cpp b/tests/validation/NEON/SoftmaxLayer.cpp index b372bdf3fa..2397d81547 100644 --- a/tests/validation/NEON/SoftmaxLayer.cpp +++ b/tests/validation/NEON/SoftmaxLayer.cpp @@ -22,14 +22,12 @@ * SOFTWARE. */ #include "arm_compute/core/Types.h" -#include "arm_compute/core/utils/StringUtils.h" #include "arm_compute/runtime/NEON/functions/NESoftmaxLayer.h" #include "arm_compute/runtime/Tensor.h" #include "arm_compute/runtime/TensorAllocator.h" #include "src/common/cpuinfo/CpuIsaInfo.h" #include "src/cpu/kernels/CpuSoftmaxKernel.h" #include "tests/NEON/Accessor.h" -#include "tests/PaddingCalculator.h" #include "tests/datasets/ShapeDatasets.h" #include "tests/framework/Asserts.h" #include "tests/framework/Macros.h" @@ -42,6 +40,7 @@ namespace test { namespace validation { +using framework::dataset::make; namespace { /** Tolerance for float operations */ @@ -53,7 +52,7 @@ constexpr AbsoluteTolerance<uint8_t> tolerance_qasymm8(1); constexpr AbsoluteTolerance<int8_t> tolerance_qasymm8_signed(1); /** CNN data types */ -const auto CNNDataTypes = framework::dataset::make("DataType", +const auto CNNDataTypes = make("DataType", { #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC DataType::F16, @@ -66,53 +65,53 @@ TEST_SUITE(NEON) TEST_SUITE(SoftmaxLayer) // *INDENT-OFF* // clang-format off -DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip( - framework::dataset::make("InputInfo", { TensorInfo(TensorShape(27U, 13U), 1, 
DataType::F32), // Mismatching data types - TensorInfo(TensorShape(27U, 13U), 1, DataType::F32), // Mismatching shapes - TensorInfo(TensorShape(27U, 13U), 1, DataType::QASYMM8, // Invalid output quantization info - QuantizationInfo(1.f/256, 12)), - TensorInfo(TensorShape(32U, 13U), 1, DataType::F32), - TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8, - QuantizationInfo(1.f/256, 12)), - TensorInfo(TensorShape(32U, 13U), 1, DataType::F32), - TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8, //Invalid axis high - QuantizationInfo(1.f/256, 12)), - TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8, //Invalid axis low - QuantizationInfo(1.f/256, 12)), - }), - framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(27U, 13U), 1, DataType::F16), - TensorInfo(TensorShape(27U, 11U), 1, DataType::F32), - TensorInfo(TensorShape(27U, 13U), 1, DataType::QASYMM8, - QuantizationInfo(1.f/256, 12)), - TensorInfo(TensorShape(32U, 13U), 1, DataType::F32), - TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8, - QuantizationInfo(1.f/256, 0)), - TensorInfo(TensorShape(32U, 13U), 1, DataType::F32), - TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8, - QuantizationInfo(1.f/256, 0)), - TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8, - QuantizationInfo(1.f/256, 0)), - })), - framework::dataset::make("beta", { 1.0, - 2.0, - 1.0, - 2.0, - 1.0, - 1.0, - 2.0, - 1.0, - })), - framework::dataset::make("axis", { 0, - 0, - 0, - 1, - 0, - -1, - 2, - -3, - })), - framework::dataset::make("Expected", { false, false, false, true, true, true, false, false })), - input_info, output_info, beta, axis, expected) +DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip( + make("InputInfo", { TensorInfo(TensorShape(27U, 13U), 1, DataType::F32), // Mismatching data types + TensorInfo(TensorShape(27U, 13U), 1, DataType::F32), // Mismatching shapes + TensorInfo(TensorShape(27U, 13U), 1, DataType::QASYMM8, // Invalid output quantization info + QuantizationInfo(1.f/256, 
12)), + TensorInfo(TensorShape(32U, 13U), 1, DataType::F32), + TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8, + QuantizationInfo(1.f/256, 12)), + TensorInfo(TensorShape(32U, 13U), 1, DataType::F32), + TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8, //Invalid axis high + QuantizationInfo(1.f/256, 12)), + TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8, //Invalid axis low + QuantizationInfo(1.f/256, 12)), + }), + make("OutputInfo",{ TensorInfo(TensorShape(27U, 13U), 1, DataType::F16), + TensorInfo(TensorShape(27U, 11U), 1, DataType::F32), + TensorInfo(TensorShape(27U, 13U), 1, DataType::QASYMM8, + QuantizationInfo(1.f/256, 12)), + TensorInfo(TensorShape(32U, 13U), 1, DataType::F32), + TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8, + QuantizationInfo(1.f/256, 0)), + TensorInfo(TensorShape(32U, 13U), 1, DataType::F32), + TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8, + QuantizationInfo(1.f/256, 0)), + TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8, + QuantizationInfo(1.f/256, 0)), + }), + make("beta", { 1.0, + 2.0, + 1.0, + 2.0, + 1.0, + 1.0, + 2.0, + 1.0, + }), + make("axis", { 0, + 0, + 0, + 1, + 0, + -1, + 2, + -3, + }), + make("Expected", { false, false, false, true, true, true, false, false })), + input_info, output_info, beta, axis, expected) { ARM_COMPUTE_EXPECT(bool(NESoftmaxLayer::validate(&input_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), beta, axis)) == expected, framework::LogLevel::ERRORS); } @@ -122,54 +121,26 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip( template <typename T> using NESoftmaxLayerFixture = SoftmaxValidationFixture<Tensor, Accessor, NESoftmaxLayer, T>; -DATA_TEST_CASE(KernelSelection_max_logits, framework::DatasetMode::ALL, concat( - combine(framework::dataset::make("CpuExt", std::string("NEON")), - framework::dataset::make("DataType", { DataType::F32, - DataType::F16, - DataType::QASYMM8, - DataType::QASYMM8_SIGNED - 
})), - combine(framework::dataset::make("CpuExt", std::string("SVE")), - framework::dataset::make("DataType", { DataType::F32, - DataType::F16, - DataType::QASYMM8, - DataType::QASYMM8_SIGNED - }))), - cpu_ext, data_type) -{ - using namespace cpu::kernels; - - cpuinfo::CpuIsaInfo cpu_isa{}; - cpu_isa.neon = (cpu_ext == "NEON"); - cpu_isa.sve = (cpu_ext == "SVE"); - cpu_isa.fp16 = (data_type == DataType::F16); - - const auto *selected_impl = CpuLogits1DMaxKernel::get_implementation(DataTypeISASelectorData{ data_type, cpu_isa }, cpu::KernelSelectionType::Preferred); - - ARM_COMPUTE_ERROR_ON_NULLPTR(selected_impl); - - std::string expected = lower_string(cpu_ext) + "_" + cpu_impl_dt(data_type) + "_logits_1d_max"; - std::string actual = selected_impl->name; - - ARM_COMPUTE_EXPECT_EQUAL(expected, actual, framework::LogLevel::ERRORS); -} - -DATA_TEST_CASE(KernelSelection_logits, framework::DatasetMode::ALL, concat(concat( - combine(framework::dataset::make("CpuExt", std::string("NEON")), - framework::dataset::make("DataType", { DataType::F32, - DataType::F16, - DataType::QASYMM8, - DataType::QASYMM8_SIGNED - })), - combine(framework::dataset::make("CpuExt", std::string("SVE")), - framework::dataset::make("DataType", { DataType::F32, - DataType::F16 - }))), - combine(framework::dataset::make("CpuExt", std::string("SVE2")), - framework::dataset::make("DataType", { DataType::QASYMM8, - DataType::QASYMM8_SIGNED - }))), - cpu_ext, data_type) +DATA_TEST_CASE(KernelSelection, framework::DatasetMode::ALL, + concat(concat( + combine( + make("CpuExt", std::string("NEON")), + make("DataType", { DataType::F32, + DataType::F16, + DataType::QASYMM8, + DataType::QASYMM8_SIGNED}) + ), + combine( + make("CpuExt", std::string("SVE")), + make("DataType", { DataType::F32, + DataType::F16})) + ), + combine( + make("CpuExt", std::string("SVE2")), + make("DataType", { DataType::QASYMM8, + DataType::QASYMM8_SIGNED})) + ), + cpu_ext, data_type) { using namespace cpu::kernels; @@ -179,11 +150,12 
@@ DATA_TEST_CASE(KernelSelection_logits, framework::DatasetMode::ALL, concat(conca cpu_isa.sve2 = (cpu_ext == "SVE2"); cpu_isa.fp16 = (data_type == DataType::F16); - const auto *selected_impl = CpuLogits1DSoftmaxKernel<false>::get_implementation(DataTypeISASelectorData{ data_type, cpu_isa }, cpu::KernelSelectionType::Preferred); + const auto *selected_impl = CpuSoftmaxKernel::get_implementation( + SoftmaxKernelDataTypeISASelectorData{ data_type, cpu_isa, false /* is_log */ }, cpu::KernelSelectionType::Preferred); ARM_COMPUTE_ERROR_ON_NULLPTR(selected_impl); - std::string expected = lower_string(cpu_ext) + "_" + cpu_impl_dt(data_type) + "_softmax_logits_1d"; + std::string expected = "neon_" + cpu_impl_dt(data_type) + "_softmax"; std::string actual = selected_impl->name; ARM_COMPUTE_EXPECT_EQUAL(expected, actual, framework::LogLevel::ERRORS); @@ -192,26 +164,32 @@ DATA_TEST_CASE(KernelSelection_logits, framework::DatasetMode::ALL, concat(conca TEST_SUITE(Float) #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC TEST_SUITE(FP16) -FIXTURE_DATA_TEST_CASE(RunSmall, NESoftmaxLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::Small4DShapes(), - framework::dataset::make("DataType", DataType::F16)), - framework::dataset::make("Beta", { 1.0f, 2.0f })), - framework::dataset::make("Axis", { 0, 1 }))) +FIXTURE_DATA_TEST_CASE(RunSmall, NESoftmaxLayerFixture<half>, framework::DatasetMode::PRECOMMIT, + combine( + datasets::Small4DShapes(), + make("DataType", DataType::F16), + make("Beta", { 1.0f, 2.0f }), + make("Axis", { 0, 1 }))) { // Validate output validate(Accessor(_target), _reference, tolerance_f16); } -FIXTURE_DATA_TEST_CASE(RunSmall4D, NESoftmaxLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::Small4DShapes(), - framework::dataset::make("DataType", DataType::F16)), - framework::dataset::make("Beta", { 1.0f, 2.0f })), - framework::dataset::make("Axis", { 0, 2, -1 }))) +FIXTURE_DATA_TEST_CASE(RunSmall4D, 
NESoftmaxLayerFixture<half>, framework::DatasetMode::PRECOMMIT, + combine( + datasets::Small4DShapes(), + make("DataType", DataType::F16), + make("Beta", { 1.0f, 2.0f }), + make("Axis", { 0, 2, -1 }))) { // Validate output validate(Accessor(_target), _reference, tolerance_f16); } -FIXTURE_DATA_TEST_CASE(RunLarge, NESoftmaxLayerFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::SoftmaxLayerLargeShapes(), - framework::dataset::make("DataType", DataType::F16)), - framework::dataset::make("Beta", { 1.0f, 2.0f })), - framework::dataset::make("Axis", { 0 }))) +FIXTURE_DATA_TEST_CASE(RunLarge, NESoftmaxLayerFixture<half>, framework::DatasetMode::NIGHTLY, + combine( + datasets::SoftmaxLayerLargeShapes(), + make("DataType", DataType::F16), + make("Beta", { 1.0f, 2.0f }), + make("Axis", { 0 }))) { // Validate output validate(Accessor(_target), _reference, tolerance_f16); @@ -220,26 +198,30 @@ TEST_SUITE_END() //FP16 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ TEST_SUITE(FP32) -FIXTURE_DATA_TEST_CASE(RunSmall2D, NESoftmaxLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SoftmaxLayerSmallShapes(), - framework::dataset::make("DataType", DataType::F32)), - framework::dataset::make("Beta", { 1.0f, 2.0f })), - framework::dataset::make("Axis", { 0, -1 }))) +FIXTURE_DATA_TEST_CASE(RunSmall2D, NESoftmaxLayerFixture<float>, framework::DatasetMode::PRECOMMIT, + combine( + datasets::SoftmaxLayerSmallShapes(), + make("DataType", DataType::F32), + make("Beta", { 1.0f, 2.0f }), + make("Axis", { 0, -1 }))) { // Validate output validate(Accessor(_target), _reference, tolerance_f32); } -FIXTURE_DATA_TEST_CASE(RunSmall4D, NESoftmaxLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::Small4DShapes(), - framework::dataset::make("DataType", DataType::F32)), - framework::dataset::make("Beta", { 1.0f, 2.0f })), - framework::dataset::make("Axis", { 0, -2, 3 }))) 
+FIXTURE_DATA_TEST_CASE(RunSmall4D, NESoftmaxLayerFixture<float>, framework::DatasetMode::PRECOMMIT, + combine(datasets::Small4DShapes(), + make("DataType", DataType::F32), + make("Beta", { 1.0f, 2.0f }), + make("Axis", { 0, -2, 3 }))) { // Validate output validate(Accessor(_target), _reference, tolerance_f32); } -FIXTURE_DATA_TEST_CASE(RunLarge, NESoftmaxLayerFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::SoftmaxLayerLargeShapes(), - framework::dataset::make("DataType", DataType::F32)), - framework::dataset::make("Beta", { 1.0f, 2.0f })), - framework::dataset::make("Axis", { 0 }))) +FIXTURE_DATA_TEST_CASE(RunLarge, NESoftmaxLayerFixture<float>, framework::DatasetMode::NIGHTLY, + combine(datasets::SoftmaxLayerLargeShapes(), + make("DataType", DataType::F32), + make("Beta", { 1.0f, 2.0f }), + make("Axis", { 0 }))) { // Validate output validate(Accessor(_target), _reference, tolerance_f32); @@ -252,29 +234,40 @@ using NESoftmaxLayerQuantizedFixture = SoftmaxValidationQuantizedFixture<Tensor, TEST_SUITE(Quantized) TEST_SUITE(QASYMM8) -FIXTURE_DATA_TEST_CASE(RunSmall2D, NESoftmaxLayerQuantizedFixture<uint8_t>, framework::DatasetMode::ALL, combine(combine(combine(datasets::SoftmaxLayerSmallShapes(), - framework::dataset::make("DataType", DataType::QASYMM8)), - combine(framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }), - framework::dataset::make("Beta", { 1.0f, 2.f }))), - framework::dataset::make("Axis", { 0, -1 }))) +FIXTURE_DATA_TEST_CASE(RunSmall2D, NESoftmaxLayerQuantizedFixture<uint8_t>, framework::DatasetMode::ALL, + combine( + datasets::SoftmaxLayerSmallShapes(), + make("DataType", DataType::QASYMM8), + combine( + make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }), + make("Beta", { 1.0f, 2.f }) + ), + make("Axis", { 0, -1 }))) { // Validate output validate(Accessor(_target), _reference, tolerance_qasymm8); } -FIXTURE_DATA_TEST_CASE(RunSmall4D, NESoftmaxLayerQuantizedFixture<uint8_t>, 
framework::DatasetMode::ALL, combine(combine(combine(datasets::Small4DShapes(), - framework::dataset::make("DataType", DataType::QASYMM8)), - combine(framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }), - framework::dataset::make("Beta", { 1.0f, 2.f }))), - framework::dataset::make("Axis", { 0, 1, -2 }))) +FIXTURE_DATA_TEST_CASE(RunSmall4D, NESoftmaxLayerQuantizedFixture<uint8_t>, framework::DatasetMode::ALL, + combine( + datasets::Small4DShapes(), + make("DataType", DataType::QASYMM8), + combine( + make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }), + make("Beta", { 1.0f, 2.f })), + make("Axis", { 0, 1, -2 }))) { // Validate output validate(Accessor(_target), _reference, tolerance_qasymm8); } -FIXTURE_DATA_TEST_CASE(RunLarge, NESoftmaxLayerQuantizedFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::SoftmaxLayerLargeShapes(), - framework::dataset::make("DataType", DataType::QASYMM8)), - combine(framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }), - framework::dataset::make("Beta", { 1.0f, 2.0f }))), - framework::dataset::make("Axis", { 0 }))) +FIXTURE_DATA_TEST_CASE(RunLarge, NESoftmaxLayerQuantizedFixture<uint8_t>, framework::DatasetMode::NIGHTLY, + combine( + datasets::SoftmaxLayerLargeShapes(), + make("DataType", DataType::QASYMM8), + combine( + make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }), + make("Beta", { 1.0f, 2.0f }) + ), + make("Axis", { 0 }))) { // Validate output validate(Accessor(_target), _reference, tolerance_qasymm8); @@ -282,20 +275,28 @@ FIXTURE_DATA_TEST_CASE(RunLarge, NESoftmaxLayerQuantizedFixture<uint8_t>, framew TEST_SUITE_END() //QASYMM8 TEST_SUITE(QASYMM8_SIGNED) -FIXTURE_DATA_TEST_CASE(RunSmall2D, NESoftmaxLayerQuantizedFixture<int8_t>, framework::DatasetMode::ALL, combine(combine(combine(datasets::SoftmaxLayerSmallShapes(), - framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)), - 
combine(framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }), - framework::dataset::make("Beta", { 1.0f, 2.f }))), - framework::dataset::make("Axis", { 0, -1 }))) +FIXTURE_DATA_TEST_CASE(RunSmall2D, NESoftmaxLayerQuantizedFixture<int8_t>, framework::DatasetMode::ALL, + combine( + datasets::SoftmaxLayerSmallShapes(), + make("DataType", DataType::QASYMM8_SIGNED), + combine( + make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }), + make("Beta", { 1.0f, 2.f }) + ), + make("Axis", { 0, -1 }))) { // Validate output validate(Accessor(_target), _reference, tolerance_qasymm8_signed); } -FIXTURE_DATA_TEST_CASE(RunSmall4D, NESoftmaxLayerQuantizedFixture<int8_t>, framework::DatasetMode::ALL, combine(combine(combine(datasets::Small4DShapes(), - framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)), - combine(framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }), - framework::dataset::make("Beta", { 1.0f, 2.f }))), - framework::dataset::make("Axis", { 0, 1, -1 }))) +FIXTURE_DATA_TEST_CASE(RunSmall4D, NESoftmaxLayerQuantizedFixture<int8_t>, framework::DatasetMode::ALL, + combine( + datasets::Small4DShapes(), + make("DataType", DataType::QASYMM8_SIGNED), + combine( + make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }), + make("Beta", { 1.0f, 2.f }) + ), + make("Axis", { 0, 1, -1 }))) { // Validate output validate(Accessor(_target), _reference, tolerance_qasymm8_signed); |