aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--docs/user_guide/release_version_and_change_log.dox6
-rw-r--r--filelist.json10
-rw-r--r--src/BUILD.bazel10
-rw-r--r--src/CMakeLists.txt6
-rw-r--r--src/core/NEON/wrapper/intrinsics/max.h41
-rw-r--r--src/cpu/kernels/CpuKernelSelectionTypes.h16
-rw-r--r--src/cpu/kernels/CpuSoftmaxKernel.cpp263
-rw-r--r--src/cpu/kernels/CpuSoftmaxKernel.h99
-rw-r--r--src/cpu/kernels/softmax/generic/neon/fp16.cpp21
-rw-r--r--src/cpu/kernels/softmax/generic/neon/fp32.cpp23
-rw-r--r--src/cpu/kernels/softmax/generic/neon/impl.cpp152
-rw-r--r--src/cpu/kernels/softmax/generic/neon/impl.h210
-rw-r--r--src/cpu/kernels/softmax/generic/neon/qasymm8.cpp22
-rw-r--r--src/cpu/kernels/softmax/generic/neon/qasymm8_signed.cpp23
-rw-r--r--src/cpu/kernels/softmax/generic/sve/fp16.cpp50
-rw-r--r--src/cpu/kernels/softmax/generic/sve/fp32.cpp49
-rw-r--r--src/cpu/kernels/softmax/generic/sve/impl.cpp25
-rw-r--r--src/cpu/kernels/softmax/generic/sve/qasymm8.cpp38
-rw-r--r--src/cpu/kernels/softmax/generic/sve/qasymm8_signed.cpp38
-rw-r--r--src/cpu/kernels/softmax/generic/sve2/impl.cpp20
-rw-r--r--src/cpu/kernels/softmax/generic/sve2/qasymm8.cpp44
-rw-r--r--src/cpu/kernels/softmax/generic/sve2/qasymm8_signed.cpp44
-rw-r--r--src/cpu/kernels/softmax/list.h31
-rw-r--r--src/cpu/operators/CpuSoftmax.cpp91
-rw-r--r--src/cpu/operators/CpuSoftmax.h39
-rw-r--r--src/runtime/NEON/functions/NESoftmaxLayer.cpp22
-rw-r--r--tests/validation/NEON/SoftmaxLayer.cpp299
27 files changed, 631 insertions, 1061 deletions
diff --git a/docs/user_guide/release_version_and_change_log.dox b/docs/user_guide/release_version_and_change_log.dox
index 13f4e9ea2a..ac4f0610ea 100644
--- a/docs/user_guide/release_version_and_change_log.dox
+++ b/docs/user_guide/release_version_and_change_log.dox
@@ -46,6 +46,8 @@ v24.01 Public major release
You should link only to the main `libarm_compute` library for core functionality.
- New features
- Add support for FP16 in all multi_isa builds.
+ - Performance optimizations:
+ - Optimize @ref NESoftmaxLayer
v23.11 Public major release
- New features
@@ -438,8 +440,8 @@ v21.02 Public major release
- @ref NEActivationLayer
- @ref NEArithmeticAddition
- @ref NEBatchNormalizationLayerKernel
- - @ref cpu::kernels::CpuLogits1DSoftmaxKernel
- - @ref cpu::kernels::CpuLogits1DMaxKernel
+ - cpu::kernels::CpuLogits1DSoftmaxKernel
+ - cpu::kernels::CpuLogits1DMaxKernel
- @ref cpu::kernels::CpuElementwiseUnaryKernel
- Remove padding from OpenCL kernels:
- CLDirectConvolutionLayerKernel
diff --git a/filelist.json b/filelist.json
index c34eff2ff9..60f4285a03 100644
--- a/filelist.json
+++ b/filelist.json
@@ -2213,16 +2213,10 @@
"qasymm8_signed":["src/cpu/kernels/softmax/generic/neon/qasymm8_signed.cpp"]
},
"sve": {
- "common": [ "src/cpu/kernels/softmax/generic/sve/impl.cpp" ],
- "fp32": ["src/cpu/kernels/softmax/generic/sve/fp32.cpp"],
- "fp16": ["src/cpu/kernels/softmax/generic/sve/fp16.cpp"],
- "qasymm8": ["src/cpu/kernels/softmax/generic/sve/qasymm8.cpp" ],
- "qasymm8_signed": ["src/cpu/kernels/softmax/generic/sve/qasymm8_signed.cpp"]
+ "common": [ "src/cpu/kernels/softmax/generic/sve/impl.cpp" ]
},
"sve2":{
- "common" :["src/cpu/kernels/softmax/generic/sve2/impl.cpp"],
- "qasymm8":[ "src/cpu/kernels/softmax/generic/sve2/qasymm8.cpp"],
- "qasymm8_signed":["src/cpu/kernels/softmax/generic/sve2/qasymm8_signed.cpp"]
+ "common" :["src/cpu/kernels/softmax/generic/sve2/impl.cpp"]
}
}
},
diff --git a/src/BUILD.bazel b/src/BUILD.bazel
index f281b6a4d5..c14e10c836 100644
--- a/src/BUILD.bazel
+++ b/src/BUILD.bazel
@@ -117,9 +117,7 @@ filegroup(
"cpu/kernels/elementwise_binary/generic/sve2/qasymm8_signed.cpp",
"cpu/kernels/elementwise_unary/generic/sve2/q8.cpp",
"cpu/kernels/lut/generic/sve2/u8.cpp",
- "cpu/kernels/softmax/generic/sve2/impl.cpp",
- "cpu/kernels/softmax/generic/sve2/qasymm8.cpp",
- "cpu/kernels/softmax/generic/sve2/qasymm8_signed.cpp"] +
+ "cpu/kernels/softmax/generic/sve2/impl.cpp"] +
glob(["**/*.h",
"**/*.hpp",
"**/*.inl"]),
@@ -342,11 +340,7 @@ filegroup(
"cpu/kernels/scale/sve/integer.cpp",
"cpu/kernels/scale/sve/qasymm8.cpp",
"cpu/kernels/scale/sve/qasymm8_signed.cpp",
- "cpu/kernels/softmax/generic/sve/fp16.cpp",
- "cpu/kernels/softmax/generic/sve/fp32.cpp",
- "cpu/kernels/softmax/generic/sve/impl.cpp",
- "cpu/kernels/softmax/generic/sve/qasymm8.cpp",
- "cpu/kernels/softmax/generic/sve/qasymm8_signed.cpp"] +
+ "cpu/kernels/softmax/generic/sve/impl.cpp"] +
glob(["**/*.h",
"**/*.hpp",
"**/*.inl"]),
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 3229ffa8c2..e6c6782da1 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -317,11 +317,7 @@ target_sources(
cpu/kernels/scale/sve/integer.cpp
cpu/kernels/scale/sve/qasymm8.cpp
cpu/kernels/scale/sve/qasymm8_signed.cpp
- cpu/kernels/softmax/generic/sve/fp16.cpp
- cpu/kernels/softmax/generic/sve/fp32.cpp
cpu/kernels/softmax/generic/sve/impl.cpp
- cpu/kernels/softmax/generic/sve/qasymm8.cpp
- cpu/kernels/softmax/generic/sve/qasymm8_signed.cpp
)
target_sources(
@@ -339,8 +335,6 @@ target_sources(
cpu/kernels/elementwise_unary/generic/sve2/q8.cpp
cpu/kernels/lut/generic/sve2/u8.cpp
cpu/kernels/softmax/generic/sve2/impl.cpp
- cpu/kernels/softmax/generic/sve2/qasymm8.cpp
- cpu/kernels/softmax/generic/sve2/qasymm8_signed.cpp
)
target_sources(
diff --git a/src/core/NEON/wrapper/intrinsics/max.h b/src/core/NEON/wrapper/intrinsics/max.h
index cec437d171..32d38a856c 100644
--- a/src/core/NEON/wrapper/intrinsics/max.h
+++ b/src/core/NEON/wrapper/intrinsics/max.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2020, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_WRAPPER_MAX_H
-#define ARM_COMPUTE_WRAPPER_MAX_H
+#ifndef ACL_SRC_CORE_NEON_WRAPPER_INTRINSICS_MAX_H
+#define ACL_SRC_CORE_NEON_WRAPPER_INTRINSICS_MAX_H
#include <arm_neon.h>
@@ -59,6 +59,39 @@ VMAX_IMPL(float16_t, float16x8_t, vmaxq, f16)
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
#undef VMAX_IMPL
+
+#if defined(__aarch64__)
+// VMAXV: Across vector max
+#define VMAXV_IMPL(stype, vtype, prefix, postfix) \
+ inline stype vmaxv(const vtype &a) \
+ { \
+ return prefix##_##postfix(a); \
+ }
+
+VMAXV_IMPL(uint8_t, uint8x8_t, vmaxv, u8)
+VMAXV_IMPL(int8_t, int8x8_t, vmaxv, s8)
+VMAXV_IMPL(uint16_t, uint16x4_t, vmaxv, u16)
+VMAXV_IMPL(int16_t, int16x4_t, vmaxv, s16)
+VMAXV_IMPL(uint32_t, uint32x2_t, vmaxv, u32)
+VMAXV_IMPL(int32_t, int32x2_t, vmaxv, s32)
+VMAXV_IMPL(float, float32x2_t, vmaxv, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VMAXV_IMPL(float16_t, float16x4_t, vmaxv, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+VMAXV_IMPL(uint8_t, uint8x16_t, vmaxvq, u8)
+VMAXV_IMPL(int8_t, int8x16_t, vmaxvq, s8)
+VMAXV_IMPL(uint16_t, uint16x8_t, vmaxvq, u16)
+VMAXV_IMPL(int16_t, int16x8_t, vmaxvq, s16)
+VMAXV_IMPL(uint32_t, uint32x4_t, vmaxvq, u32)
+VMAXV_IMPL(int32_t, int32x4_t, vmaxvq, s32)
+VMAXV_IMPL(float, float32x4_t, vmaxvq, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VMAXV_IMPL(float16_t, float16x8_t, vmaxvq, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+#undef VMAXV_IMPL
+#endif // defined(__aarch64__)
} // namespace wrapper
} // namespace arm_compute
-#endif /* ARM_COMPUTE_WRAPPER_MAX_H */
+#endif // ACL_SRC_CORE_NEON_WRAPPER_INTRINSICS_MAX_H
diff --git a/src/cpu/kernels/CpuKernelSelectionTypes.h b/src/cpu/kernels/CpuKernelSelectionTypes.h
index b7daa4d583..45ebeec394 100644
--- a/src/cpu/kernels/CpuKernelSelectionTypes.h
+++ b/src/cpu/kernels/CpuKernelSelectionTypes.h
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_CPU_KERNEL_SELECTION_TYPES_H
-#define ARM_COMPUTE_CPU_KERNEL_SELECTION_TYPES_H
+#ifndef ACL_SRC_CPU_KERNELS_CPUKERNELSELECTIONTYPES_H
+#define ACL_SRC_CPU_KERNELS_CPUKERNELSELECTIONTYPES_H
#include "arm_compute/core/Types.h"
@@ -99,6 +99,13 @@ struct ScaleKernelDataTypeISASelectorData
InterpolationPolicy interpolation_policy;
};
+struct SoftmaxKernelDataTypeISASelectorData
+{
+ DataType dt;
+ cpuinfo::CpuIsaInfo isa;
+ bool is_log;
+};
+
// Selector pointer types
using DataTypeISASelectorPtr = std::add_pointer<bool(const DataTypeISASelectorData &data)>::type;
using DataTypeDataLayoutSelectorPtr = std::add_pointer<bool(const DataTypeDataLayoutISASelectorData &data)>::type;
@@ -113,9 +120,10 @@ using CpuAddKernelDataTypeISASelectorDataPtr =
std::add_pointer<bool(const CpuAddKernelDataTypeISASelectorData &data)>::type;
using ScaleKernelDataTypeISASelectorDataPtr =
std::add_pointer<bool(const ScaleKernelDataTypeISASelectorData &data)>::type;
-
+using SoftmaxKernelDataTypeISASelectorDataPtr =
+ std::add_pointer<bool(const SoftmaxKernelDataTypeISASelectorData &data)>::type;
} // namespace kernels
} // namespace cpu
} // namespace arm_compute
-#endif // ARM_COMPUTE_CPU_KERNEL_SELECTION_TYPES_H
+#endif // ACL_SRC_CPU_KERNELS_CPUKERNELSELECTIONTYPES_H
diff --git a/src/cpu/kernels/CpuSoftmaxKernel.cpp b/src/cpu/kernels/CpuSoftmaxKernel.cpp
index ce144351f8..486f55e2c1 100644
--- a/src/cpu/kernels/CpuSoftmaxKernel.cpp
+++ b/src/cpu/kernels/CpuSoftmaxKernel.cpp
@@ -34,9 +34,12 @@
#include "src/core/common/Registrars.h"
#include "src/core/CPP/Validate.h"
#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/Utils.h"
#include "src/core/helpers/WindowHelpers.h"
#include "src/cpu/kernels/softmax/list.h"
+#include <vector>
+
namespace arm_compute
{
namespace cpu
@@ -45,136 +48,40 @@ namespace kernels
{
namespace
{
-/* Softmax Logits 1D Max - identifying the max value of 1D Logits */
-static const std::vector<CpuLogits1DMaxKernel::SoftmaxLogits1DMaxKernel> available_kernels_max_logits = {
- {"sve_fp32_logits_1d_max",
- [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F32) && data.isa.sve; },
- REGISTER_FP32_SVE(sve_fp32_logits)},
- {"sve_fp16_logits_1d_max",
- [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F16) && data.isa.sve && data.isa.fp16; },
- REGISTER_FP16_SVE(sve_fp16_logits)},
- {"sve_qu8_logits_1d_max",
- [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8) && data.isa.sve; },
- REGISTER_QASYMM8_SVE(sve_qasymm8_logits)},
- {"sve_qs8_logits_1d_max",
- [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8_SIGNED) && data.isa.sve; },
- REGISTER_QASYMM8_SIGNED_SVE(sve_qasymm8_signed_logits)},
- {"neon_fp32_logits_1d_max", [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F32); },
- REGISTER_FP32_NEON(neon_fp32_logits)},
- {"neon_fp16_logits_1d_max",
- [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F16) && data.isa.fp16; },
- REGISTER_FP16_NEON(neon_fp16_logits)},
- {"neon_qu8_logits_1d_max", [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8); },
- REGISTER_QASYMM8_NEON(neon_qasymm8_logits)},
- {"neon_qs8_logits_1d_max",
- [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8_SIGNED); },
- REGISTER_QASYMM8_SIGNED_NEON(neon_qasymm8_singed_logits)},
+/* Softmax */
+static const std::vector<typename CpuSoftmaxKernel::SoftmaxKernel> available_kernels = {
+ {"neon_fp32_softmax",
+ [](const SoftmaxKernelDataTypeISASelectorData &data) { return (!data.is_log && data.dt == DataType::F32); },
+ REGISTER_FP32_NEON(neon_fp32_softmax<false>)},
+ {"neon_fp16_softmax",
+ [](const SoftmaxKernelDataTypeISASelectorData &data)
+ { return (!data.is_log && data.dt == DataType::F16) && data.isa.fp16; },
+ REGISTER_FP16_NEON(neon_fp16_softmax<false>)},
+ {"neon_qu8_softmax",
+ [](const SoftmaxKernelDataTypeISASelectorData &data) { return (!data.is_log && data.dt == DataType::QASYMM8); },
+ REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_qasymm8_softmax<false>)},
+ {"neon_qs8_softmax",
+ [](const SoftmaxKernelDataTypeISASelectorData &data)
+ { return (!data.is_log && data.dt == DataType::QASYMM8_SIGNED); },
+ REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_qasymm8_signed_softmax<false>)},
+ {"neon_fp32_log_softmax",
+ [](const SoftmaxKernelDataTypeISASelectorData &data) { return (data.is_log && data.dt == DataType::F32); },
+ REGISTER_FP32_NEON(neon_fp32_softmax<true>)},
+ {"neon_fp16_log_softmax",
+ [](const SoftmaxKernelDataTypeISASelectorData &data)
+ { return (data.is_log && data.dt == DataType::F16) && data.isa.fp16; },
+ REGISTER_FP16_NEON(neon_fp16_softmax<true>)},
+ {"neon_qu8_log_softmax",
+ [](const SoftmaxKernelDataTypeISASelectorData &data) { return (data.is_log && data.dt == DataType::QASYMM8); },
+ REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_qasymm8_softmax<true>)},
+ {"neon_qs8_log_softmax",
+ [](const SoftmaxKernelDataTypeISASelectorData &data)
+ { return (data.is_log && data.dt == DataType::QASYMM8_SIGNED); },
+ REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_qasymm8_signed_softmax<true>)},
};
-Status validate_arguments_logits_1d_max(const ITensorInfo &input, const ITensorInfo &output)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&input);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
- DataType::F16, DataType::F32);
-
- // Validate in case of configured output
- if (output.total_size() != 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input, &output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&input, &output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output.tensor_shape(),
- TensorShape(input.tensor_shape()).set(0, 1));
- }
-
- return Status{};
-}
-} //namespace
-const std::vector<CpuLogits1DMaxKernel::SoftmaxLogits1DMaxKernel> &CpuLogits1DMaxKernel::get_available_kernels()
-{
- return available_kernels_max_logits;
-}
-
-void CpuLogits1DMaxKernel::configure(const ITensorInfo *src, ITensorInfo *dst)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_logits_1d_max(*src, *dst));
-
- // Softmax across the x dimension
- const TensorShape output_shape = TensorShape(src->tensor_shape()).set(0, 1);
- // Output auto initialization if not yet initialized
- auto_init_if_empty(*dst, output_shape, 1, src->data_type(), src->quantization_info());
-
- const auto *uk = get_implementation(DataTypeISASelectorData{src->data_type(), CPUInfo::get().get_isa()});
- ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
-
- _run_method = uk->ukernel;
- _name = std::string("CpuLogits1DMaxKernel").append("/").append(uk->name);
-
- Window win = calculate_max_window(*src, Steps());
- ICpuKernel::configure(win);
-}
-
-Status CpuLogits1DMaxKernel::validate(const ITensorInfo *src, const ITensorInfo *dst)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_logits_1d_max(*src, *dst));
-
- return Status{};
-}
-
-void CpuLogits1DMaxKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
- ARM_COMPUTE_ERROR_ON(_run_method == nullptr);
-
- const auto src = tensors.get_const_tensor(TensorType::ACL_SRC);
- auto dst = tensors.get_tensor(TensorType::ACL_DST);
-
- _run_method(src, dst, window);
-}
-
-const char *CpuLogits1DMaxKernel::name() const
-{
- return _name.c_str();
-}
-
-/* Softmax Logits 1D - computation for QASYMM8 with pre-computed max. */
-template <bool IS_LOG>
-static const std::vector<typename CpuLogits1DSoftmaxKernel<IS_LOG>::SoftmaxLogits1DKernel> available_kernels_logits = {
- {"sve2_qu8_softmax_logits_1d",
- [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8) && data.isa.sve2; },
- REGISTER_QASYMM8_SVE2(sve2_qasymm8_softmax)},
- {"sve2_qs8_softmax_logits_1d",
- [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8_SIGNED) && data.isa.sve2; },
- REGISTER_QASYMM8_SIGNED_SVE2(sve2_qasymm8_signed_softmax)},
- {"sve_fp32_softmax_logits_1d",
- [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F32) && data.isa.sve; },
- REGISTER_FP32_SVE(sve_fp32_softmax)},
- {"sve_fp16_softmax_logits_1d",
- [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F16) && data.isa.sve && data.isa.fp16; },
- REGISTER_FP16_SVE(sve_fp16_softmax)},
-
- {"neon_fp32_softmax_logits_1d", [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F32); },
- REGISTER_FP32_NEON(neon_fp32_softmax)},
- {"neon_fp16_softmax_logits_1d",
- [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F16) && data.isa.fp16; },
- REGISTER_FP16_NEON(neon_fp16_softmax)},
- {"neon_qu8_softmax_logits_1d", [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8); },
- REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_qasymm8_softmax)},
- {"neon_qs8_softmax_logits_1d",
- [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8_SIGNED); },
- REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_qasymm8_signed_softmax)},
-};
-namespace
-{
-Status validate_arguments_logits_softmax(const ITensorInfo &src,
- const ITensorInfo &max,
- const ITensorInfo &dst,
- const float beta,
- const ITensorInfo &tmp,
- bool is_log)
+Status validate_arguments_softmax(
+ const ITensorInfo &src, const ITensorInfo &dst, float beta, const ITensorInfo &tmp, bool is_log)
{
ARM_COMPUTE_UNUSED(beta);
// Check input
@@ -184,11 +91,6 @@ Status validate_arguments_logits_softmax(const ITensorInfo &src,
const bool is_quantized_asymmetric = is_data_type_quantized_asymmetric(src.data_type());
- // Check max
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src, &max);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(TensorShape(src.tensor_shape()).set(0, 1), max.tensor_shape());
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&src, &max);
-
// Check output if configured
if (dst.total_size() != 0)
{
@@ -203,8 +105,11 @@ Status validate_arguments_logits_softmax(const ITensorInfo &src,
// Check tmp if configured
if (tmp.total_size() != 0)
{
- const DataType tmp_data_type = is_quantized_asymmetric ? DataType::F32 : src.data_type();
- ARM_COMPUTE_RETURN_ERROR_ON(tmp.data_type() != tmp_data_type);
+ // We have temporary storage only if src data type is quantized.
+ // Therefore, tmp data type must be F32
+ ARM_COMPUTE_RETURN_ERROR_ON(tmp.data_type() != DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON(!is_quantized_asymmetric);
+
// We could potentially reduce tmp memory if we could predict or make an assumption
// on the maximum number of threads that will run in parallel.
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&src, &tmp);
@@ -214,91 +119,97 @@ Status validate_arguments_logits_softmax(const ITensorInfo &src,
}
} // namespace
-template <bool IS_LOG>
-const std::vector<typename CpuLogits1DSoftmaxKernel<IS_LOG>::SoftmaxLogits1DKernel> &
-CpuLogits1DSoftmaxKernel<IS_LOG>::get_available_kernels()
+const std::vector<typename CpuSoftmaxKernel::SoftmaxKernel> &CpuSoftmaxKernel::get_available_kernels()
{
- return available_kernels_logits<IS_LOG>;
+ return available_kernels;
}
-template <bool IS_LOG>
-void CpuLogits1DSoftmaxKernel<IS_LOG>::configure(
- const ITensorInfo *src, const ITensorInfo *max, ITensorInfo *dst, const float beta, ITensorInfo *tmp)
+void CpuSoftmaxKernel::configure(const ITensorInfo *src, ITensorInfo *dst, float beta, bool is_log, ITensorInfo *tmp)
{
- ARM_COMPUTE_ERROR_ON_NULLPTR(src, max, dst, tmp);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_logits_softmax(*src, *max, *dst, beta, *tmp, IS_LOG));
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst, tmp);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_softmax(*src, *dst, beta, *tmp, is_log));
// Configure kernel window
const bool is_quantized_asymmetric = is_data_type_quantized_asymmetric(src->data_type());
// Output auto initialization if not yet initialized
const QuantizationInfo output_quantization =
- is_quantized_asymmetric ? arm_compute::get_softmax_output_quantization_info(src->data_type(), IS_LOG)
+ is_quantized_asymmetric ? arm_compute::get_softmax_output_quantization_info(src->data_type(), is_log)
: dst->quantization_info();
auto_init_if_empty(*dst, TensorInfo(*src).set_quantization_info(output_quantization).reset_padding());
- // Tmp auto initialization if not yet initialized
- const DataType tmp_data_type = is_quantized_asymmetric ? DataType::F32 : src->data_type();
- auto_init_if_empty(*tmp, TensorInfo(*src).set_data_type(tmp_data_type).reset_padding());
+ // Tmp auto initialization if not yet initialized and src is quantized
+ if (is_quantized_asymmetric)
+ {
+ const DataType tmp_data_type = is_quantized_asymmetric ? DataType::F32 : src->data_type();
+ auto_init_if_empty(*tmp, TensorInfo(*src).set_data_type(tmp_data_type).reset_padding());
+ }
- const auto *uk = CpuLogits1DSoftmaxKernel<IS_LOG>::get_implementation(
- DataTypeISASelectorData{src->data_type(), CPUInfo::get().get_isa()});
+ const auto *uk = CpuSoftmaxKernel::get_implementation(
+ SoftmaxKernelDataTypeISASelectorData{src->data_type(), CPUInfo::get().get_isa(), is_log});
ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
- std::string kernel_name =
- IS_LOG ? std::string("CpuLogits1DLogSoftmaxKernel") : std::string("CpuLogits1DSoftmaxKernel");
+ std::string kernel_name = is_log ? std::string("CpuLogSoftmaxKernel") : std::string("CpuSoftmaxKernel");
_beta = beta;
_run_method = uk->ukernel;
_name = kernel_name.append("/").append(uk->name);
- // Configure kernel window
- Window win = calculate_max_window(*max, Steps());
+ Window win = calculate_max_window(*dst, Steps());
+
+ /// TODO: Check dimensions > 0 for holes only. For this, we need
+ /// a utility function checking if there are holes after some dimension.
+ if (!has_holes(*dst, dst->num_dimensions() - 1))
+ {
+ win = win.collapse(win, Window::DimY);
+ }
- ICpuKernel<CpuLogits1DSoftmaxKernel<IS_LOG>>::configure(win);
+ win.set(Window::DimX, Window::Dimension(0, 1, 1)); // First dimension is the reduction axis
+
+ ICpuKernel<CpuSoftmaxKernel>::configure(win);
}
-template <bool IS_LOG>
-Status CpuLogits1DSoftmaxKernel<IS_LOG>::validate(
- const ITensorInfo *src, const ITensorInfo *max, const ITensorInfo *dst, const float beta, const ITensorInfo *tmp)
+Status CpuSoftmaxKernel::validate(
+ const ITensorInfo *src, const ITensorInfo *dst, float beta, bool is_log, const ITensorInfo *tmp)
{
- ARM_COMPUTE_ERROR_ON_NULLPTR(src, max, dst, tmp);
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_logits_softmax(*src, *max, *dst, beta, *tmp, IS_LOG));
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst, tmp);
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_softmax(*src, *dst, beta, *tmp, is_log));
return Status{};
}
-template <bool IS_LOG>
-void CpuLogits1DSoftmaxKernel<IS_LOG>::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
+void CpuSoftmaxKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
{
- ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel<CpuLogits1DSoftmaxKernel<IS_LOG>>::window(), window);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel<CpuSoftmaxKernel>::window(), window);
ARM_COMPUTE_ERROR_ON(_run_method == nullptr);
const auto src = tensors.get_const_tensor(TensorType::ACL_SRC_0);
- auto max = tensors.get_tensor(TensorType::ACL_SRC_1);
auto dst = tensors.get_tensor(TensorType::ACL_DST_0);
- auto tmp = tensors.get_tensor(TensorType::ACL_DST_1);
- const unsigned int num_elems_processed_per_iteration = src->info()->valid_region().shape.x();
- const unsigned int tmp_size_for_thread = tmp->info()->element_size() * num_elems_processed_per_iteration;
+ if (is_data_type_quantized_asymmetric(src->info()->data_type()))
+ {
+ auto tmp = tensors.get_tensor(TensorType::ACL_DST_1);
+
+ const unsigned int num_elems_processed_per_iteration = src->info()->valid_region().shape.x();
+ const unsigned int tmp_size_for_thread = tmp->info()->element_size() * num_elems_processed_per_iteration;
- ARM_COMPUTE_ERROR_ON(tmp->info()->total_size() < (info.num_threads * tmp_size_for_thread));
+ ARM_COMPUTE_ERROR_ON(tmp->info()->total_size() < (info.num_threads * tmp_size_for_thread));
- void *tmp_for_thread = tmp->buffer() + (info.thread_id * tmp_size_for_thread);
- _run_method(src, max, tmp_for_thread, dst, _beta, IS_LOG, window);
+ void *tmp_for_thread = tmp->buffer() + (info.thread_id * tmp_size_for_thread);
+ _run_method(src, tmp_for_thread, dst, _beta, window);
+ }
+ else
+ {
+ _run_method(src, nullptr, dst, _beta, window);
+ }
}
-template <bool IS_LOG>
-const char *CpuLogits1DSoftmaxKernel<IS_LOG>::name() const
+const char *CpuSoftmaxKernel::name() const
{
return _name.c_str();
}
-template class CpuLogits1DSoftmaxKernel<true>;
-template class CpuLogits1DSoftmaxKernel<false>;
-
} // namespace kernels
} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/kernels/CpuSoftmaxKernel.h b/src/cpu/kernels/CpuSoftmaxKernel.h
index 5d288179fd..3db1f3d0ef 100644
--- a/src/cpu/kernels/CpuSoftmaxKernel.h
+++ b/src/cpu/kernels/CpuSoftmaxKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2022 Arm Limited.
+ * Copyright (c) 2017-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_CPU_SOFTMAX_KERNEL_H
-#define ARM_COMPUTE_CPU_SOFTMAX_KERNEL_H
+#ifndef ACL_SRC_CPU_KERNELS_CPUSOFTMAXKERNEL_H
+#define ACL_SRC_CPU_KERNELS_CPUSOFTMAXKERNEL_H
#include "src/core/common/Macros.h"
#include "src/cpu/ICpuKernel.h"
@@ -33,102 +33,55 @@ namespace cpu
{
namespace kernels
{
-/** Interface for the identifying the max value of 1D Logits */
-class CpuLogits1DMaxKernel : public ICpuKernel<CpuLogits1DMaxKernel>
+/** Interface for softmax computation */
+class CpuSoftmaxKernel : public ICpuKernel<CpuSoftmaxKernel>
{
private:
- using SoftmaxLogits1DMaxKernelPtr = std::add_pointer<void(const ITensor *, ITensor *, const Window &)>::type;
+ using SoftmaxKernelPtr =
+ std::add_pointer<void(const ITensor *, void *const, ITensor *, float, const Window &)>::type;
public:
- CpuLogits1DMaxKernel() = default;
- ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuLogits1DMaxKernel);
- /** Set the input and output tensors.
- *
- * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * @param[out] dst Destination tensor info. Data types supported: same as @p input
- */
- void configure(const ITensorInfo *src, ITensorInfo *dst);
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to CpuLogits1DMaxKernel::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
-
- // Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
- const char *name() const override;
-
- struct SoftmaxLogits1DMaxKernel
- {
- const char *name;
- const DataTypeISASelectorPtr is_selected;
- SoftmaxLogits1DMaxKernelPtr ukernel;
- };
-
- static const std::vector<SoftmaxLogits1DMaxKernel> &get_available_kernels();
-
-private:
- SoftmaxLogits1DMaxKernelPtr _run_method{nullptr};
- std::string _name{};
-};
-
-/** Interface for softmax computation for QASYMM8 with pre-computed max. */
-template <bool IS_LOG = false>
-class CpuLogits1DSoftmaxKernel : public ICpuKernel<CpuLogits1DSoftmaxKernel<IS_LOG>>
-{
-private:
- using SoftmaxLogits1DKernelPtr = std::add_pointer<void(
- const ITensor *, const ITensor *, void *const, ITensor *, float, bool, const Window &)>::type;
-
-public:
- CpuLogits1DSoftmaxKernel() = default;
- ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuLogits1DSoftmaxKernel);
+ CpuSoftmaxKernel() = default;
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuSoftmaxKernel);
/** Set the input and output tensors.
*
- * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * @param[in] max Max values tensor info. Same shape as input with dimension 0 set to 1.
- * Data types supported: same as @p input.
- * @param[out] dst Destination tensor info. Data types supported: same as @p input.
- * @param[in] beta A scaling factor for the exponent.
+ * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+ * @param[out] dst Destination tensor info. Data types supported: same as @p input.
+ * @param[in] beta A scaling factor for the exponent.
+ * @param[in] is_log True if the operation is log-softmax
*
* @param tmp Auxiliary tensor info. Must be type F32 and same shape as the input.
*/
- void
- configure(const ITensorInfo *src, const ITensorInfo *max, ITensorInfo *dst, const float beta, ITensorInfo *tmp);
+ void configure(const ITensorInfo *src, ITensorInfo *dst, float beta, bool is_log, ITensorInfo *tmp);
/** Static function to check if given info will lead to a valid configuration
*
- * Similar to CpuLogits1DSoftmaxKernel::configure()
+ * Similar to CpuSoftmaxKernel::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *src,
- const ITensorInfo *max,
- const ITensorInfo *dst,
- const float beta,
- const ITensorInfo *tmp);
+ static Status
+ validate(const ITensorInfo *src, const ITensorInfo *dst, float beta, bool is_log, const ITensorInfo *tmp);
// Inherited methods overridden:
void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
const char *name() const override;
- struct SoftmaxLogits1DKernel
+ struct SoftmaxKernel
{
- const char *name;
- const DataTypeISASelectorPtr is_selected;
- SoftmaxLogits1DKernelPtr ukernel;
+ const char *name;
+ const SoftmaxKernelDataTypeISASelectorDataPtr is_selected;
+ SoftmaxKernelPtr ukernel;
};
- static const std::vector<SoftmaxLogits1DKernel> &get_available_kernels();
+ static const std::vector<SoftmaxKernel> &get_available_kernels();
private:
- float _beta{1.0f};
- SoftmaxLogits1DKernelPtr _run_method{nullptr};
- std::string _name{};
+ float _beta{1.0f};
+ SoftmaxKernelPtr _run_method{nullptr};
+ std::string _name{};
};
} // namespace kernels
} // namespace cpu
} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_SOFTMAX_KERNEL_H */
+#endif // ACL_SRC_CPU_KERNELS_CPUSOFTMAXKERNEL_H
diff --git a/src/cpu/kernels/softmax/generic/neon/fp16.cpp b/src/cpu/kernels/softmax/generic/neon/fp16.cpp
index 2e2adf33e0..db8f881712 100644
--- a/src/cpu/kernels/softmax/generic/neon/fp16.cpp
+++ b/src/cpu/kernels/softmax/generic/neon/fp16.cpp
@@ -31,21 +31,18 @@ namespace arm_compute
{
namespace cpu
{
-void neon_fp16_softmax(const ITensor *in,
- const ITensor *max,
- void *const tmp,
- ITensor *out,
- const float beta,
- bool is_log,
- const Window &window)
-{
- return neon_softmax_logits_1d_float<float16_t>(in, max, tmp, out, beta, is_log, window);
-}
-void neon_fp16_logits(const ITensor *in, ITensor *out, const Window &window)
+template <bool IS_LOG>
+void neon_fp16_softmax(const ITensor *in, void *const tmp, ITensor *out, const float beta, const Window &window)
{
- return neon_logits_1d_max<float16_t>(in, out, window);
+ return neon_softmax_float<float16_t, IS_LOG>(in, tmp, out, beta, window);
}
+
+template void
+neon_fp16_softmax<true>(const ITensor *in, void *const tmp, ITensor *out, const float beta, const Window &window);
+template void
+neon_fp16_softmax<false>(const ITensor *in, void *const tmp, ITensor *out, const float beta, const Window &window);
+
} // namespace cpu
} // namespace arm_compute
#endif //defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/cpu/kernels/softmax/generic/neon/fp32.cpp b/src/cpu/kernels/softmax/generic/neon/fp32.cpp
index 61df40c1b5..c281d1bf31 100644
--- a/src/cpu/kernels/softmax/generic/neon/fp32.cpp
+++ b/src/cpu/kernels/softmax/generic/neon/fp32.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -29,20 +29,17 @@ namespace arm_compute
{
namespace cpu
{
-void neon_fp32_softmax(const ITensor *in,
- const ITensor *max,
- void *const tmp,
- ITensor *out,
- const float beta,
- bool is_log,
- const Window &window)
-{
- return neon_softmax_logits_1d_float<float>(in, max, tmp, out, beta, is_log, window);
-}
-void neon_fp32_logits(const ITensor *in, ITensor *out, const Window &window)
+template <bool IS_LOG>
+void neon_fp32_softmax(const ITensor *in, void *const tmp, ITensor *out, const float beta, const Window &window)
{
- return neon_logits_1d_max<float>(in, out, window);
+ return neon_softmax_float<float, IS_LOG>(in, tmp, out, beta, window);
}
+
+template void
+neon_fp32_softmax<true>(const ITensor *in, void *const tmp, ITensor *out, const float beta, const Window &window);
+template void
+neon_fp32_softmax<false>(const ITensor *in, void *const tmp, ITensor *out, const float beta, const Window &window);
+
} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/kernels/softmax/generic/neon/impl.cpp b/src/cpu/kernels/softmax/generic/neon/impl.cpp
index 5d6e6a4f80..487f6ae051 100644
--- a/src/cpu/kernels/softmax/generic/neon/impl.cpp
+++ b/src/cpu/kernels/softmax/generic/neon/impl.cpp
@@ -29,43 +29,76 @@ namespace arm_compute
{
namespace cpu
{
-template void neon_logits_1d_max<qasymm8_signed_t>(const ITensor *in, ITensor *out, const Window &window);
-template void neon_logits_1d_max<qasymm8_t>(const ITensor *in, ITensor *out, const Window &window);
-
-template <typename T>
-void neon_softmax_logits_1d_quantized(
- const ITensor *in, const ITensor *max, void *const tmp, ITensor *out, float beta, bool is_log, const Window &window)
+template <typename T, bool IS_LOG>
+void neon_softmax_quantized(const ITensor *in, void *const tmp, ITensor *out, float beta, const Window &window)
{
static_assert(std::is_same<T, qasymm8_t>::value || std::is_same<T, qasymm8_signed_t>::value,
"quantized type should be either qasymm8_t or qasymm8_signed_t.");
- const int start_x = in->info()->valid_region().anchor.x();
const int input_width = in->info()->valid_region().shape.x();
- const float scale_beta = -beta * in->info()->quantization_info().uniform().scale;
- const auto scale_beta_vec = vdupq_n_f32(scale_beta);
+ const float scale_beta = -beta * in->info()->quantization_info().uniform().scale;
+ const float32x4_t scale_beta_vec = vdupq_n_f32(scale_beta);
+
+ Iterator in_it(in, window);
+ Iterator out_it(out, window);
- Iterator in_it(in, window);
- Iterator max_it(max, window);
- Iterator out_it(out, window);
constexpr int vec_size = 16;
+#ifndef __aarch64__
+ const int sum_stages = log2(vec_size >> 1);
+#endif // __aarch64__
+
+ using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>;
+
execute_window_loop(
window,
[&](const Coordinates &)
{
/* Get pointers */
- const auto in_ptr = reinterpret_cast<const T *>(in_it.ptr()) + start_x;
- const auto out_ptr = reinterpret_cast<T *>(out_it.ptr()) + start_x;
- const auto tmp_ptr = reinterpret_cast<float *>(tmp);
+ const T *in_ptr = reinterpret_cast<const T *>(in_it.ptr());
+ T *out_ptr = reinterpret_cast<T *>(out_it.ptr());
+ float *tmp_ptr = reinterpret_cast<float *>(tmp);
+
+ T max_val;
+
+ /* Compute Max */
+ {
+ // Init max value
+ auto vec_max = wrapper::vdup_n(support::cpp11::lowest<T>(), ExactTagType{});
+ int x = 0;
- float sum{};
- float sum_inversed{};
+ for (; x <= (input_width - vec_size); x += vec_size)
+ {
+ const auto current_value = wrapper::vloadq(in_ptr + x);
+ vec_max = wrapper::vmax(vec_max, current_value);
+ }
+
+#ifdef __aarch64__
+ max_val = wrapper::vmaxv(vec_max);
+#else // __aarch64__
+ auto carry_max = wrapper::vpmax(wrapper::vgethigh(vec_max), wrapper::vgetlow(vec_max));
+
+ for (int i = 0; i < sum_stages; ++i)
+ {
+ carry_max = wrapper::vpmax(carry_max, carry_max);
+ }
+
+ max_val = wrapper::vgetlane(carry_max, 0);
+#endif // __aarch64__
+
+ // Compute left-over elements
+ for (; x < input_width; ++x)
+ {
+ max_val = std::max(*(in_ptr + x), max_val);
+ }
+ } // Compute Max
+
+ float sum_transformed{};
/* Compute exponentials and sum */
{
/* Get max value */
- const auto max_val = *reinterpret_cast<const T *>(max_it.ptr());
const auto vec_max = wrapper::vdup_n(max_val, wrapper::traits::vector_128_tag{});
/* Init sum to zero */
@@ -80,11 +113,11 @@ void neon_softmax_logits_1d_quantized(
int x = 0;
for (; x <= (input_width - vec_size); x += vec_size)
{
- auto vec_elements = wrapper::vloadq(in_ptr + x);
- vec_elements = wrapper::vqsub(vec_max, vec_elements);
- auto vec_elements_flt = convert_int_to_float<float32x4x4_t>(vec_elements);
+ auto vec_elements = wrapper::vloadq(in_ptr + x);
+ vec_elements = wrapper::vqsub(vec_max, vec_elements);
+ float32x4x4_t vec_elements_flt = convert_int_to_float<float32x4x4_t>(vec_elements);
- if (is_log)
+ if (IS_LOG)
{
vec_elements_flt.val[0] = vmulq_f32(vec_elements_flt.val[0], scale_beta_vec);
vec_elements_flt.val[1] = vmulq_f32(vec_elements_flt.val[1], scale_beta_vec);
@@ -111,17 +144,24 @@ void neon_softmax_logits_1d_quantized(
}
/* Reduce sum */
- const auto sum_16_byte =
+ const float32x4_t sum_16_byte =
vaddq_f32(vaddq_f32(vec_sum.val[0], vec_sum.val[1]), vaddq_f32(vec_sum.val[2], vec_sum.val[3]));
+
+ float sum;
+
+#ifdef __aarch64__
+ sum = wrapper::vaddv(sum_16_byte);
+#else // __aarch64__
auto sum_res = vpadd_f32(vget_high_f32(sum_16_byte), vget_low_f32(sum_16_byte));
sum_res = vpadd_f32(sum_res, sum_res);
sum = wrapper::vgetlane(sum_res, 0);
+#endif // __aarch64__
/* Run remaining elements */
for (; x < input_width; ++x)
{
float element{};
- if (is_log)
+ if (IS_LOG)
{
element = (max_val - in_ptr[x]) * scale_beta;
sum += std::exp(element);
@@ -135,19 +175,22 @@ void neon_softmax_logits_1d_quantized(
tmp_ptr[x] = element;
}
- if (!is_log)
+ if (!IS_LOG)
{
- sum_inversed = 256.f / sum;
+ sum_transformed = 256.f / sum;
}
else
{
- sum = std::log(sum);
+ sum_transformed = std::log(sum);
}
- }
+ } // Compute exponentials and sum
/* Normalize exponentials */
{
constexpr bool is_qasymm8_signed = std::is_same<T, qasymm8_signed_t>::value;
+
+ const float32x4_t sum_vec = vdupq_n_f32(sum_transformed);
+
/* Loop over row and compute softmax */
int x = 0;
for (; x <= (input_width - vec_size); x += vec_size)
@@ -155,23 +198,23 @@ void neon_softmax_logits_1d_quantized(
using int_vec_type = wrapper::traits::neon_vector_t<T, 16>;
float32x4x4_t vec_in = vld4q_f32(tmp_ptr + x);
int_vec_type normalized_value{};
- if (is_log)
+ if (IS_LOG)
{
const float32x4x4_t sub = {
- vsubq_f32(vec_in.val[0], vdupq_n_f32(sum)),
- vsubq_f32(vec_in.val[1], vdupq_n_f32(sum)),
- vsubq_f32(vec_in.val[2], vdupq_n_f32(sum)),
- vsubq_f32(vec_in.val[3], vdupq_n_f32(sum)),
+ vsubq_f32(vec_in.val[0], sum_vec),
+ vsubq_f32(vec_in.val[1], sum_vec),
+ vsubq_f32(vec_in.val[2], sum_vec),
+ vsubq_f32(vec_in.val[3], sum_vec),
};
normalized_value = convert_float_to_int<float32x4x4_t, int_vec_type>(sub);
}
else
{
float32x4x4_t mul = {
- vmulq_f32(vec_in.val[0], vdupq_n_f32(sum_inversed)),
- vmulq_f32(vec_in.val[1], vdupq_n_f32(sum_inversed)),
- vmulq_f32(vec_in.val[2], vdupq_n_f32(sum_inversed)),
- vmulq_f32(vec_in.val[3], vdupq_n_f32(sum_inversed)),
+ vmulq_f32(vec_in.val[0], sum_vec),
+ vmulq_f32(vec_in.val[1], sum_vec),
+ vmulq_f32(vec_in.val[2], sum_vec),
+ vmulq_f32(vec_in.val[3], sum_vec),
};
if (is_qasymm8_signed)
@@ -190,34 +233,31 @@ void neon_softmax_logits_1d_quantized(
/* Run remaining elements */
for (; x < input_width; ++x)
{
- if (is_log)
+ if (IS_LOG)
{
- out_ptr[x] = utils::cast::saturate_cast<T>(tmp_ptr[x] - sum);
+ out_ptr[x] = utils::cast::saturate_cast<T>(tmp_ptr[x] - sum_transformed);
}
else
{
- out_ptr[x] = utils::cast::saturate_cast<T>((tmp_ptr[x] * sum_inversed) -
+ out_ptr[x] = utils::cast::saturate_cast<T>((tmp_ptr[x] * sum_transformed) -
(is_qasymm8_signed ? 128.f : 0));
}
}
- }
+ } // Normalize exponentials
},
- in_it, max_it, out_it);
+ in_it, out_it);
}
-template void neon_softmax_logits_1d_quantized<qasymm8_signed_t>(const ITensor *in,
- const ITensor *max,
- void *const tmp,
- ITensor *out,
- float beta,
- bool is_log,
- const Window &window);
-template void neon_softmax_logits_1d_quantized<qasymm8_t>(const ITensor *in,
- const ITensor *max,
- void *const tmp,
- ITensor *out,
- float beta,
- bool is_log,
- const Window &window);
+template void neon_softmax_quantized<qasymm8_signed_t, true>(
+ const ITensor *in, void *const tmp, ITensor *out, float beta, const Window &window);
+
+template void neon_softmax_quantized<qasymm8_signed_t, false>(
+ const ITensor *in, void *const tmp, ITensor *out, float beta, const Window &window);
+
+template void neon_softmax_quantized<qasymm8_t, true>(
+ const ITensor *in, void *const tmp, ITensor *out, float beta, const Window &window);
+
+template void neon_softmax_quantized<qasymm8_t, false>(
+ const ITensor *in, void *const tmp, ITensor *out, float beta, const Window &window);
} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/kernels/softmax/generic/neon/impl.h b/src/cpu/kernels/softmax/generic/neon/impl.h
index 4d9b789297..60380cd233 100644
--- a/src/cpu/kernels/softmax/generic/neon/impl.h
+++ b/src/cpu/kernels/softmax/generic/neon/impl.h
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef SRC_CORE_NEON_KERNELS_SOFTMAX_IMPL_H
-#define SRC_CORE_NEON_KERNELS_SOFTMAX_IMPL_H
+#ifndef ACL_SRC_CPU_KERNELS_SOFTMAX_GENERIC_NEON_IMPL_H
+#define ACL_SRC_CPU_KERNELS_SOFTMAX_GENERIC_NEON_IMPL_H
#include "arm_compute/core/Helpers.h"
@@ -33,105 +33,100 @@ namespace arm_compute
{
namespace cpu
{
-template <typename T>
-void neon_logits_1d_max(const ITensor *in, ITensor *out, const Window &window)
-{
- /** SIMD vector tag type. */
- using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>;
-
- constexpr int window_step_x = 16 / sizeof(T);
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
-
- Window win{window};
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
- Iterator input(in, win);
- Iterator output(out, win);
-
- const int sum_stages = log2(window_step_x / 2);
- execute_window_loop(
- win,
- [&](const Coordinates &)
- {
- // Get pointers
- const auto in_ptr = reinterpret_cast<const T *>(input.ptr());
- const auto out_ptr = reinterpret_cast<T *>(output.ptr());
-
- // Init max value
- auto vec_max = wrapper::vdup_n(support::cpp11::lowest<T>(), ExactTagType{});
- int x = window_start_x;
-
- for (; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto current_value = wrapper::vloadq(in_ptr + x);
- vec_max = wrapper::vmax(vec_max, current_value);
- }
- auto carry_max = wrapper::vpmax(wrapper::vgethigh(vec_max), wrapper::vgetlow(vec_max));
-
- for (int i = 0; i < sum_stages; ++i)
- {
- carry_max = wrapper::vpmax(carry_max, carry_max);
- }
- T max_val = wrapper::vgetlane(carry_max, 0);
- // Compute left-over elements
- for (; x < window_end_x; ++x)
- {
- max_val = *(in_ptr + x) > max_val ? *(in_ptr + x) : max_val;
- }
+#ifdef __aarch64__
+namespace
+{
+// These helper functions are added because vaddv does not exist for fp16,
+// and, therefore, is not part of the wrapper::vaddv interface.
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+inline float16_t wrapper_vaddv(const float16x8_t &a, int sum_stages)
+{
+ auto sum_res = wrapper::vpadd(wrapper::vgethigh(a), wrapper::vgetlow(a));
+ for (int i = 0; i < sum_stages; ++i)
+ {
+ sum_res = wrapper::vpadd(sum_res, sum_res);
+ }
+ return wrapper::vgetlane(sum_res, 0);
+}
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- *out_ptr = max_val;
- },
- input, output);
+inline float wrapper_vaddv(const float32x4_t &a, int sum_stages)
+{
+ ARM_COMPUTE_UNUSED(sum_stages);
+ return wrapper::vaddv(a);
}
+} // namespace
+#endif // __aarch64__
-template <typename T>
-void neon_softmax_logits_1d_quantized(const ITensor *in,
- const ITensor *max,
- void *const tmp,
- ITensor *out,
- float beta,
- bool is_log,
- const Window &window);
-
-template <typename T>
-void neon_softmax_logits_1d_float(const ITensor *in,
- const ITensor *max,
- void *const tmp,
- ITensor *out,
- const float beta,
- bool is_log,
- const Window &window)
+// The template implementation for float data types is stored in the header file because
+// we need all fp16 instantiated code to live in fp16.cpp files.
+template <typename T, bool IS_LOG>
+void neon_softmax_float(const ITensor *in, void *const tmp, ITensor *out, float beta, const Window &window)
{
- const int start_x = in->info()->valid_region().anchor.x();
+ ARM_COMPUTE_UNUSED(tmp);
+
const int input_width = in->info()->valid_region().shape.x();
Iterator in_it(in, window);
- Iterator max_it(max, window);
Iterator out_it(out, window);
/** SIMD vector tag type. */
using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>;
- constexpr int vec_size = 16 / sizeof(T);
- const int sum_stages = log2(vec_size / 2);
+ constexpr int vec_size = 16 / sizeof(T);
+
+ const int sum_stages = log2(vec_size >> 1);
+
+ const auto beta_vec = wrapper::vdup_n(static_cast<T>(beta), ExactTagType{});
execute_window_loop(
window,
[&](const Coordinates &)
{
/* Get pointers */
- const auto in_ptr = reinterpret_cast<const T *>(in_it.ptr()) + start_x;
- const auto out_ptr = reinterpret_cast<T *>(out_it.ptr()) + start_x;
- const auto tmp_ptr = reinterpret_cast<T *>(tmp);
+ const T *in_ptr = reinterpret_cast<const T *>(in_it.ptr());
+ T *out_ptr = reinterpret_cast<T *>(out_it.ptr());
+
+ T max_val;
+
+ /* Compute Max */
+ {
+ // Init max value
+ auto vec_max = wrapper::vdup_n(support::cpp11::lowest<T>(), ExactTagType{});
+ int x = 0;
+
+ for (; x <= (input_width - vec_size); x += vec_size)
+ {
+ const auto current_value = wrapper::vloadq(in_ptr + x);
+ vec_max = wrapper::vmax(vec_max, current_value);
+ }
+
+#ifdef __aarch64__
+ max_val = wrapper::vmaxv(vec_max);
+#else // __aarch64__
+ auto carry_max = wrapper::vpmax(wrapper::vgethigh(vec_max), wrapper::vgetlow(vec_max));
+
+ for (int i = 0; i < sum_stages; ++i)
+ {
+ carry_max = wrapper::vpmax(carry_max, carry_max);
+ }
+
+ max_val = wrapper::vgetlane(carry_max, 0);
+#endif // __aarch64__
- T sum{};
- T sum_inversed{};
+ // Compute left-over elements
+ for (; x < input_width; ++x)
+ {
+ max_val = std::max(*(in_ptr + x), max_val);
+ }
+ } // compute max
+
+ T sum_transformed{};
/* Compute exponentials and sum */
{
/* Get max value */
- const auto max_val = *reinterpret_cast<const T *>(max_it.ptr());
const auto vec_max = wrapper::vdup_n(max_val, ExactTagType{});
/* Init sum to zero */
@@ -143,35 +138,38 @@ void neon_softmax_logits_1d_float(const ITensor *in,
{
auto vec_elements = wrapper::vloadq(in_ptr + x);
vec_elements = wrapper::vsub(vec_elements, vec_max);
- if (is_log)
+ if (IS_LOG)
{
- vec_elements =
- wrapper::vmul(vec_elements, wrapper::vdup_n(static_cast<T>(beta), ExactTagType{}));
- vec_sum = wrapper::vadd(vec_sum, wrapper::vexpq(vec_elements));
+ vec_elements = wrapper::vmul(vec_elements, beta_vec);
+ vec_sum = wrapper::vadd(vec_sum, wrapper::vexpq(vec_elements));
}
else
{
- vec_elements = wrapper::vexpq(
- wrapper::vmul(vec_elements, wrapper::vdup_n(static_cast<T>(beta), ExactTagType{})));
- vec_sum = wrapper::vadd(vec_sum, vec_elements);
+ vec_elements = wrapper::vexpq(wrapper::vmul(vec_elements, beta_vec));
+ vec_sum = wrapper::vadd(vec_sum, vec_elements);
}
- wrapper::vstore(tmp_ptr + x, vec_elements);
+ wrapper::vstore(out_ptr + x, vec_elements);
}
/* Reduce sum */
+ T sum{};
+#ifdef __aarch64__
+ sum = wrapper_vaddv(vec_sum, sum_stages);
+#else // __aarch64__
auto sum_res = wrapper::vpadd(wrapper::vgethigh(vec_sum), wrapper::vgetlow(vec_sum));
for (int i = 0; i < sum_stages; ++i)
{
sum_res = wrapper::vpadd(sum_res, sum_res);
}
sum = wrapper::vgetlane(sum_res, 0);
+#endif // __aarch64__
/* Run remaining elements */
for (; x < input_width; ++x)
{
T element{};
- if (is_log)
+ if (IS_LOG)
{
element = (in_ptr[x] - max_val) * beta;
sum += std::exp(element);
@@ -181,55 +179,59 @@ void neon_softmax_logits_1d_float(const ITensor *in,
element = std::exp((in_ptr[x] - max_val) * beta);
sum += element;
}
- tmp_ptr[x] = element;
+
+ out_ptr[x] = element;
}
- if (!is_log)
+ if (!IS_LOG)
{
- sum_inversed = T(1) / sum;
+ sum_transformed = T(1) / sum;
}
else
{
- sum = static_cast<T>(std::log(sum));
+ sum_transformed = static_cast<T>(std::log(sum));
}
- }
+ } // Compute exponentials and sum
/* Normalize exponentials */
{
+ const auto sum_vec = wrapper::vdup_n(static_cast<T>(sum_transformed), ExactTagType{});
+
/* Loop over row and compute softmax */
int x = 0;
for (; x <= (input_width - vec_size); x += vec_size)
{
- auto vec_in = wrapper::vloadq(tmp_ptr + x);
- auto normalized_value = wrapper::vdup_n(static_cast<T>(0), ExactTagType{});
- if (is_log)
+ const auto vec_in = wrapper::vloadq(out_ptr + x);
+ if (IS_LOG)
{
- normalized_value = wrapper::vsub(vec_in, wrapper::vdup_n(static_cast<T>(sum), ExactTagType{}));
+ wrapper::vstore(out_ptr + x, wrapper::vsub(vec_in, sum_vec));
}
else
{
- normalized_value =
- wrapper::vmul(vec_in, wrapper::vdup_n(static_cast<T>(sum_inversed), ExactTagType{}));
+ wrapper::vstore(out_ptr + x, wrapper::vmul(vec_in, sum_vec));
}
- wrapper::vstore(out_ptr + x, normalized_value);
}
+
/* Run remaining elements */
for (; x < input_width; ++x)
{
- if (is_log)
+ if (IS_LOG)
{
- out_ptr[x] = tmp_ptr[x] - sum;
+ out_ptr[x] = out_ptr[x] - sum_transformed;
}
else
{
- out_ptr[x] = tmp_ptr[x] * sum_inversed;
+ out_ptr[x] = out_ptr[x] * sum_transformed;
}
}
- }
+ } // Normalize exponentials
},
- in_it, max_it, out_it);
+ in_it, out_it);
}
+
+template <typename T, bool IS_LOG>
+void neon_softmax_quantized(const ITensor *in, void *const tmp, ITensor *out, float beta, const Window &window);
} // namespace cpu
} // namespace arm_compute
-#endif /* SRC_CORE_NEON_KERNELS_SOFTMAX_IMPL_H */
+#endif // ACL_SRC_CPU_KERNELS_SOFTMAX_GENERIC_NEON_IMPL_H
diff --git a/src/cpu/kernels/softmax/generic/neon/qasymm8.cpp b/src/cpu/kernels/softmax/generic/neon/qasymm8.cpp
index 40713dc496..9589ebcd7c 100644
--- a/src/cpu/kernels/softmax/generic/neon/qasymm8.cpp
+++ b/src/cpu/kernels/softmax/generic/neon/qasymm8.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -29,20 +29,16 @@ namespace arm_compute
{
namespace cpu
{
-void neon_qasymm8_softmax(const ITensor *in,
- const ITensor *max,
- void *const tmp,
- ITensor *out,
- const float beta,
- bool is_log,
- const Window &window)
+template <bool IS_LOG>
+void neon_qasymm8_softmax(const ITensor *in, void *const tmp, ITensor *out, const float beta, const Window &window)
{
- return neon_softmax_logits_1d_quantized<qasymm8_t>(in, max, tmp, out, beta, is_log, window);
+ return neon_softmax_quantized<qasymm8_t, IS_LOG>(in, tmp, out, beta, window);
}
-void neon_qasymm8_logits(const ITensor *in, ITensor *out, const Window &window)
-{
- return neon_logits_1d_max<qasymm8_t>(in, out, window);
-}
+template void
+neon_qasymm8_softmax<true>(const ITensor *in, void *const tmp, ITensor *out, const float beta, const Window &window);
+template void
+neon_qasymm8_softmax<false>(const ITensor *in, void *const tmp, ITensor *out, const float beta, const Window &window);
+
} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/kernels/softmax/generic/neon/qasymm8_signed.cpp b/src/cpu/kernels/softmax/generic/neon/qasymm8_signed.cpp
index 2c5e284f54..0bf6b2859a 100644
--- a/src/cpu/kernels/softmax/generic/neon/qasymm8_signed.cpp
+++ b/src/cpu/kernels/softmax/generic/neon/qasymm8_signed.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -29,20 +29,17 @@ namespace arm_compute
{
namespace cpu
{
-void neon_qasymm8_signed_softmax(const ITensor *in,
- const ITensor *max,
- void *const tmp,
- ITensor *out,
- const float beta,
- bool is_log,
- const Window &window)
+template <bool IS_LOG>
+void neon_qasymm8_signed_softmax(
+ const ITensor *in, void *const tmp, ITensor *out, const float beta, const Window &window)
{
- return neon_softmax_logits_1d_quantized<qasymm8_signed_t>(in, max, tmp, out, beta, is_log, window);
+ return neon_softmax_quantized<qasymm8_signed_t, IS_LOG>(in, tmp, out, beta, window);
}
-void neon_qasymm8_singed_logits(const ITensor *in, ITensor *out, const Window &window)
-{
- return neon_logits_1d_max<qasymm8_signed_t>(in, out, window);
-}
+template void neon_qasymm8_signed_softmax<true>(
+ const ITensor *in, void *const tmp, ITensor *out, const float beta, const Window &window);
+template void neon_qasymm8_signed_softmax<false>(
+ const ITensor *in, void *const tmp, ITensor *out, const float beta, const Window &window);
+
} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/kernels/softmax/generic/sve/fp16.cpp b/src/cpu/kernels/softmax/generic/sve/fp16.cpp
deleted file mode 100644
index 5e94f72faf..0000000000
--- a/src/cpu/kernels/softmax/generic/sve/fp16.cpp
+++ /dev/null
@@ -1,50 +0,0 @@
-/*
- * Copyright (c) 2021-2023 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
-#include "arm_compute/core/Helpers.h"
-
-#include "src/cpu/CpuTypes.h"
-#include "src/cpu/kernels/softmax/generic/sve/impl.h"
-namespace arm_compute
-{
-namespace cpu
-{
-void sve_fp16_softmax(const ITensor *in,
- const ITensor *max,
- void *const tmp,
- ITensor *out,
- const float beta,
- bool is_log,
- const Window &window)
-{
- return sve_softmax_logits_1d_float<float16_t>(in, max, tmp, out, beta, is_log, window);
-}
-
-void sve_fp16_logits(const ITensor *in, ITensor *out, const Window &window)
-{
- return sve_logits_1d_max<float16_t>(in, out, window);
-}
-} // namespace cpu
-} // namespace arm_compute
-#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
diff --git a/src/cpu/kernels/softmax/generic/sve/fp32.cpp b/src/cpu/kernels/softmax/generic/sve/fp32.cpp
deleted file mode 100644
index d692cc2477..0000000000
--- a/src/cpu/kernels/softmax/generic/sve/fp32.cpp
+++ /dev/null
@@ -1,49 +0,0 @@
-/*
- * Copyright (c) 2021-2022 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/Helpers.h"
-
-#include "src/cpu/kernels/softmax/generic/sve/impl.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-void sve_fp32_softmax(const ITensor *in,
- const ITensor *max,
- void *const tmp,
- ITensor *out,
- const float beta,
- bool is_log,
- const Window &window)
-{
- return sve_softmax_logits_1d_float<float>(in, max, tmp, out, beta, is_log, window);
-}
-
-void sve_fp32_logits(const ITensor *in, ITensor *out, const Window &window)
-{
- return sve_logits_1d_max<float>(in, out, window);
-}
-} // namespace cpu
-} // namespace arm_compute
diff --git a/src/cpu/kernels/softmax/generic/sve/impl.cpp b/src/cpu/kernels/softmax/generic/sve/impl.cpp
index 24f1bb8143..0d4b7f4509 100644
--- a/src/cpu/kernels/softmax/generic/sve/impl.cpp
+++ b/src/cpu/kernels/softmax/generic/sve/impl.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -30,6 +30,9 @@ namespace arm_compute
{
namespace cpu
{
+/// TODO: (COMPMID-6505) Similar to Neon(TM), this implementation be converted to
+/// a single kernel that performs softmax operation. Leaving the SVE code here for
+/// future references. Implementation for Neon(TM) is introduced in COMPMID-6500
template <typename ScalarType>
void sve_logits_1d_max(const ITensor *in, ITensor *out, const Window &window)
{
@@ -172,25 +175,5 @@ void sve_softmax_logits_1d_float(const ITensor *in,
},
in_it, max_it, out_it);
}
-
-template void sve_logits_1d_max<float>(const ITensor *in, ITensor *out, const Window &window);
-template void sve_logits_1d_max<float16_t>(const ITensor *in, ITensor *out, const Window &window);
-template void sve_logits_1d_max<qasymm8_t>(const ITensor *in, ITensor *out, const Window &window);
-template void sve_logits_1d_max<qasymm8_signed_t>(const ITensor *in, ITensor *out, const Window &window);
-
-template void sve_softmax_logits_1d_float<float>(const ITensor *in,
- const ITensor *max,
- void *const tmp,
- ITensor *out,
- const float beta,
- bool is_log,
- const Window &window);
-template void sve_softmax_logits_1d_float<float16_t>(const ITensor *in,
- const ITensor *max,
- void *const tmp,
- ITensor *out,
- const float beta,
- bool is_log,
- const Window &window);
} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/kernels/softmax/generic/sve/qasymm8.cpp b/src/cpu/kernels/softmax/generic/sve/qasymm8.cpp
deleted file mode 100644
index 85e5ccfea1..0000000000
--- a/src/cpu/kernels/softmax/generic/sve/qasymm8.cpp
+++ /dev/null
@@ -1,38 +0,0 @@
-/*
- * Copyright (c) 2021-2022 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/Helpers.h"
-
-#include "src/cpu/kernels/softmax/generic/sve/impl.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-void sve_qasymm8_logits(const ITensor *in, ITensor *out, const Window &window)
-{
- return sve_logits_1d_max<qasymm8_t>(in, out, window);
-}
-} // namespace cpu
-} // namespace arm_compute
diff --git a/src/cpu/kernels/softmax/generic/sve/qasymm8_signed.cpp b/src/cpu/kernels/softmax/generic/sve/qasymm8_signed.cpp
deleted file mode 100644
index 4be2e2eed6..0000000000
--- a/src/cpu/kernels/softmax/generic/sve/qasymm8_signed.cpp
+++ /dev/null
@@ -1,38 +0,0 @@
-/*
- * Copyright (c) 2021-2022 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/Helpers.h"
-
-#include "src/cpu/kernels/softmax/generic/sve/impl.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-void sve_qasymm8_signed_logits(const ITensor *in, ITensor *out, const Window &window)
-{
- return sve_logits_1d_max<qasymm8_signed_t>(in, out, window);
-}
-} // namespace cpu
-} // namespace arm_compute
diff --git a/src/cpu/kernels/softmax/generic/sve2/impl.cpp b/src/cpu/kernels/softmax/generic/sve2/impl.cpp
index 98b2f5117f..a8fb1d4adf 100644
--- a/src/cpu/kernels/softmax/generic/sve2/impl.cpp
+++ b/src/cpu/kernels/softmax/generic/sve2/impl.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -32,6 +32,9 @@ namespace arm_compute
{
namespace cpu
{
+/// TODO: (COMPMID-6505) Similar to Neon(TM), this implementation be converted to
+/// a single kernel that performs softmax operation. Leaving the SVE2 code here for
+/// future references. Implementation for Neon(TM) is introduced in COMPMID-6500
template <typename ScalarType>
void sve2_softmax_logits_1d_quantized(
const ITensor *in, const ITensor *max, void *const tmp, ITensor *out, float beta, bool is_log, const Window &window)
@@ -205,20 +208,5 @@ void sve2_softmax_logits_1d_quantized(
},
in_it, max_it, out_it);
}
-
-template void sve2_softmax_logits_1d_quantized<qasymm8_signed_t>(const ITensor *in,
- const ITensor *max,
- void *const tmp,
- ITensor *out,
- float beta,
- bool is_log,
- const Window &window);
-template void sve2_softmax_logits_1d_quantized<qasymm8_t>(const ITensor *in,
- const ITensor *max,
- void *const tmp,
- ITensor *out,
- float beta,
- bool is_log,
- const Window &window);
} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/kernels/softmax/generic/sve2/qasymm8.cpp b/src/cpu/kernels/softmax/generic/sve2/qasymm8.cpp
deleted file mode 100644
index 95623786b3..0000000000
--- a/src/cpu/kernels/softmax/generic/sve2/qasymm8.cpp
+++ /dev/null
@@ -1,44 +0,0 @@
-/*
- * Copyright (c) 2021-2022 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/Helpers.h"
-
-#include "src/cpu/kernels/softmax/generic/sve2/impl.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-void sve2_qasymm8_softmax(const ITensor *in,
- const ITensor *max,
- void *const tmp,
- ITensor *out,
- const float beta,
- bool is_log,
- const Window &window)
-{
- return sve2_softmax_logits_1d_quantized<qasymm8_t>(in, max, tmp, out, beta, is_log, window);
-}
-} // namespace cpu
-} // namespace arm_compute
diff --git a/src/cpu/kernels/softmax/generic/sve2/qasymm8_signed.cpp b/src/cpu/kernels/softmax/generic/sve2/qasymm8_signed.cpp
deleted file mode 100644
index c20462fcef..0000000000
--- a/src/cpu/kernels/softmax/generic/sve2/qasymm8_signed.cpp
+++ /dev/null
@@ -1,44 +0,0 @@
-/*
- * Copyright (c) 2021-2022 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/Helpers.h"
-
-#include "src/cpu/kernels/softmax/generic/sve2/impl.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-void sve2_qasymm8_signed_softmax(const ITensor *in,
- const ITensor *max,
- void *const tmp,
- ITensor *out,
- const float beta,
- bool is_log,
- const Window &window)
-{
- return sve2_softmax_logits_1d_quantized<qasymm8_signed_t>(in, max, tmp, out, beta, is_log, window);
-}
-} // namespace cpu
-} // namespace arm_compute
diff --git a/src/cpu/kernels/softmax/list.h b/src/cpu/kernels/softmax/list.h
index 627ce0c264..c143f6659d 100644
--- a/src/cpu/kernels/softmax/list.h
+++ b/src/cpu/kernels/softmax/list.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,41 +21,24 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef SRC_CORE_NEON_KERNELS_SOFTMAX_LIST_H
-#define SRC_CORE_NEON_KERNELS_SOFTMAX_LIST_H
+#ifndef ACL_SRC_CPU_KERNELS_SOFTMAX_LIST_H
+#define ACL_SRC_CPU_KERNELS_SOFTMAX_LIST_H
namespace arm_compute
{
namespace cpu
{
-#define DECLARE_SOFTMAX_KERNEL(func_name) \
- void func_name(const ITensor *in, const ITensor *max, void *const tmp, ITensor *out, const float beta, \
- bool is_log, const Window &window)
+#define DECLARE_SOFTMAX_KERNEL(func_name) \
+ template <bool IS_LOG> \
+ void func_name(const ITensor *in, void *const tmp, ITensor *out, const float beta, const Window &window)
DECLARE_SOFTMAX_KERNEL(neon_fp32_softmax);
DECLARE_SOFTMAX_KERNEL(neon_fp16_softmax);
DECLARE_SOFTMAX_KERNEL(neon_qasymm8_softmax);
DECLARE_SOFTMAX_KERNEL(neon_qasymm8_signed_softmax);
-DECLARE_SOFTMAX_KERNEL(sve_fp32_softmax);
-DECLARE_SOFTMAX_KERNEL(sve_fp16_softmax);
-DECLARE_SOFTMAX_KERNEL(sve2_qasymm8_signed_softmax);
-DECLARE_SOFTMAX_KERNEL(sve2_qasymm8_softmax);
#undef DECLARE_SOFTMAX_KERNEL
-
-#define DECLARE_LOGITS_KERNEL(func_name) void func_name(const ITensor *in, ITensor *out, const Window &window)
-
-DECLARE_LOGITS_KERNEL(neon_fp32_logits);
-DECLARE_LOGITS_KERNEL(neon_fp16_logits);
-DECLARE_LOGITS_KERNEL(neon_qasymm8_logits);
-DECLARE_LOGITS_KERNEL(neon_qasymm8_singed_logits);
-DECLARE_LOGITS_KERNEL(sve_fp32_logits);
-DECLARE_LOGITS_KERNEL(sve_fp16_logits);
-DECLARE_LOGITS_KERNEL(sve_qasymm8_logits);
-DECLARE_LOGITS_KERNEL(sve_qasymm8_signed_logits);
-
-#undef DECLARE_LOGITS_KERNEL
} // namespace cpu
} // namespace arm_compute
-#endif /* SRC_CORE_NEON_KERNELS_SOFTMAX_LIST_H */
+#endif // ACL_SRC_CPU_KERNELS_SOFTMAX_LIST_H
diff --git a/src/cpu/operators/CpuSoftmax.cpp b/src/cpu/operators/CpuSoftmax.cpp
index e55d7f903e..ae14381ad9 100644
--- a/src/cpu/operators/CpuSoftmax.cpp
+++ b/src/cpu/operators/CpuSoftmax.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -41,13 +41,10 @@ namespace arm_compute
{
namespace cpu
{
-template <bool IS_LOG>
-CpuSoftmaxGeneric<IS_LOG>::CpuSoftmaxGeneric()
+CpuSoftmaxGeneric::CpuSoftmaxGeneric()
: _permute_input(),
_permute_output(),
- _max_kernel(),
_softmax_kernel(),
- _max(),
_tmp(),
_input_permuted(),
_output_permuted(),
@@ -56,8 +53,7 @@ CpuSoftmaxGeneric<IS_LOG>::CpuSoftmaxGeneric()
{
}
-template <bool IS_LOG>
-void CpuSoftmaxGeneric<IS_LOG>::configure(const ITensorInfo *src, ITensorInfo *dst, float beta, int32_t axis)
+void CpuSoftmaxGeneric::configure(const ITensorInfo *src, ITensorInfo *dst, float beta, int32_t axis, bool is_log)
{
// Perform validation step
ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
@@ -79,29 +75,23 @@ void CpuSoftmaxGeneric<IS_LOG>::configure(const ITensorInfo *src, ITensorInfo *d
// or it is the original input case (2D case)
const ITensorInfo *tmp_input = (_needs_permute ? &_input_permuted : src);
- // Create intermediate tensors shapes
- TensorShape max_sum_shape = tmp_input->tensor_shape();
- max_sum_shape.set(0, 1);
- const TensorInfo input_info = tmp_input->clone()->reset_padding().set_is_resizable(true);
- DataType tmp_data_type =
- is_data_type_quantized_asymmetric(tmp_input->data_type()) ? DataType::F32 : tmp_input->data_type();
- TensorInfo tensor_info_tmp(input_info.clone()->set_data_type(tmp_data_type));
- TensorInfo max_info(tmp_input->clone()->set_tensor_shape(max_sum_shape));
+ TensorInfo tensor_info_tmp;
+ if (is_data_type_quantized_asymmetric(src->data_type()))
+ {
+ // Create intermediate tensors shapes
+ const TensorInfo input_info = tmp_input->clone()->reset_padding().set_is_resizable(true);
+ tensor_info_tmp = input_info.clone()->set_data_type(DataType::F32);
+ }
// Init intermediate tensors
- _max = TensorInfo(max_info);
_tmp = TensorInfo(tensor_info_tmp);
// Configure kernels
- auto mk = std::make_unique<kernels::CpuLogits1DMaxKernel>();
- mk->configure(tmp_input, &_max);
- _max_kernel = std::move(mk);
-
- auto sm = std::make_unique<kernels::CpuLogits1DSoftmaxKernel<IS_LOG>>();
+ auto sm = std::make_unique<kernels::CpuSoftmaxKernel>();
if (_needs_permute)
{
// The normalization kernel stores the result in a permuted output tensor
- sm->configure(tmp_input, &_max, &_output_permuted, beta, &_tmp);
+ sm->configure(tmp_input, &_output_permuted, beta, is_log, &_tmp);
// Re-permute the permuted output into the requested (4D) output
_permute_output.configure(&_output_permuted, dst,
@@ -110,14 +100,15 @@ void CpuSoftmaxGeneric<IS_LOG>::configure(const ITensorInfo *src, ITensorInfo *d
else
{
// Softmax 2D case
- sm->configure(tmp_input, &_max, dst, beta, &_tmp);
+ sm->configure(tmp_input, dst, beta, is_log, &_tmp);
}
_softmax_kernel = std::move(sm);
- _aux_mem[InternalTensorIdx::MAX] =
- MemoryInfo(offset_int_vec(InternalTensorIdx::MAX), MemoryLifetime::Temporary, _max.total_size());
- _aux_mem[InternalTensorIdx::TMP] =
- MemoryInfo(offset_int_vec(InternalTensorIdx::TMP), MemoryLifetime::Temporary, _tmp.total_size());
+ if (_tmp.total_size() > 0)
+ {
+ _aux_mem[InternalTensorIdx::TMP] =
+ MemoryInfo(offset_int_vec(InternalTensorIdx::TMP), MemoryLifetime::Temporary, _tmp.total_size());
+ }
_aux_mem[InternalTensorIdx::PERMUTED_SRC] = MemoryInfo(offset_int_vec(InternalTensorIdx::PERMUTED_SRC),
MemoryLifetime::Temporary, _input_permuted.total_size());
@@ -125,8 +116,8 @@ void CpuSoftmaxGeneric<IS_LOG>::configure(const ITensorInfo *src, ITensorInfo *d
MemoryLifetime::Temporary, _output_permuted.total_size());
}
-template <bool IS_LOG>
-Status CpuSoftmaxGeneric<IS_LOG>::validate(const ITensorInfo *src, const ITensorInfo *dst, float beta, int32_t axis)
+Status
+CpuSoftmaxGeneric::validate(const ITensorInfo *src, const ITensorInfo *dst, float beta, int32_t axis, bool is_log)
{
// Perform validation step
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
@@ -136,17 +127,12 @@ Status CpuSoftmaxGeneric<IS_LOG>::validate(const ITensorInfo *src, const ITensor
static_cast<int32_t>(src->num_dimensions()) <= axis);
// Create intermediate tensor info
- DataType tmp_data_type = src->data_type();
- const TensorInfo tensor_info_tmp(src->clone()->set_data_type(tmp_data_type).set_is_resizable(true));
-
- TensorShape max_sum_shape = src->tensor_shape();
- max_sum_shape.set(0, 1);
- const TensorInfo tensor_info_max_sum(src->clone()
- ->set_tensor_shape(max_sum_shape)
- .set_data_type(tmp_data_type)
- .set_quantization_info(src->quantization_info())
- .set_is_resizable(true));
- const TensorInfo dont_care;
+ TensorInfo tensor_info_tmp;
+
+ if (is_data_type_quantized_asymmetric(src->data_type()))
+ {
+ tensor_info_tmp = src->clone()->set_data_type(DataType::F32).set_is_resizable(true);
+ }
const unsigned int actual_axis =
static_cast<unsigned int>(wrap_around(axis, static_cast<int32_t>(src->num_dimensions())));
@@ -165,15 +151,12 @@ Status CpuSoftmaxGeneric<IS_LOG>::validate(const ITensorInfo *src, const ITensor
ARM_COMPUTE_RETURN_ON_ERROR(CpuPermute::validate(&output_permuted, dst, permutation_vector));
}
- ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuLogits1DMaxKernel::validate(src, &tensor_info_max_sum));
- ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuLogits1DSoftmaxKernel<IS_LOG>::validate(
- &tensor_info_tmp, &tensor_info_max_sum, dst, beta, &dont_care));
+ ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuSoftmaxKernel::validate(src, dst, beta, is_log, &tensor_info_tmp));
return Status{};
}
-template <bool IS_LOG>
-void CpuSoftmaxGeneric<IS_LOG>::run(ITensorPack &tensors)
+void CpuSoftmaxGeneric::run(ITensorPack &tensors)
{
ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided");
@@ -181,13 +164,11 @@ void CpuSoftmaxGeneric<IS_LOG>::run(ITensorPack &tensors)
auto dst = tensors.get_tensor(TensorType::ACL_DST);
CpuAuxTensorHandler tmp(offset_int_vec(InternalTensorIdx::TMP), _tmp, tensors, true);
- CpuAuxTensorHandler max(offset_int_vec(InternalTensorIdx::MAX), _max, tensors, true);
CpuAuxTensorHandler input_permuted(offset_int_vec(InternalTensorIdx::PERMUTED_SRC), _input_permuted, tensors, true);
CpuAuxTensorHandler output_permuted(offset_int_vec(InternalTensorIdx::PERMUTED_DST), _output_permuted, tensors,
true);
- ITensorPack max_pack;
ITensorPack softmax_pack;
if (_needs_permute)
@@ -195,24 +176,15 @@ void CpuSoftmaxGeneric<IS_LOG>::run(ITensorPack &tensors)
ITensorPack permute_in_pack = {{TensorType::ACL_SRC, src}, {TensorType::ACL_DST, input_permuted.get()}};
_permute_input.run(permute_in_pack);
- max_pack = {{TensorType::ACL_SRC, input_permuted.get()}, {TensorType::ACL_DST, max.get()}};
-
softmax_pack = {{TensorType::ACL_SRC_0, input_permuted.get()},
- {TensorType::ACL_SRC_1, max.get()},
{TensorType::ACL_DST_0, output_permuted.get()},
{TensorType::ACL_DST_1, tmp.get()}};
}
else
{
- max_pack = {{TensorType::ACL_SRC, src}, {TensorType::ACL_DST, max.get()}};
-
- softmax_pack = {{TensorType::ACL_SRC_0, src},
- {TensorType::ACL_SRC_1, max.get()},
- {TensorType::ACL_DST_0, dst},
- {TensorType::ACL_DST_1, tmp.get()}};
+ softmax_pack = {{TensorType::ACL_SRC_0, src}, {TensorType::ACL_DST_0, dst}, {TensorType::ACL_DST_1, tmp.get()}};
}
- NEScheduler::get().schedule_op(_max_kernel.get(), Window::DimY, _max_kernel->window(), max_pack);
NEScheduler::get().schedule_op(_softmax_kernel.get(), Window::DimY, _softmax_kernel->window(), softmax_pack);
if (_needs_permute)
@@ -224,13 +196,10 @@ void CpuSoftmaxGeneric<IS_LOG>::run(ITensorPack &tensors)
}
}
-template <bool IS_LOG>
-experimental::MemoryRequirements CpuSoftmaxGeneric<IS_LOG>::workspace() const
+experimental::MemoryRequirements CpuSoftmaxGeneric::workspace() const
{
return _aux_mem;
}
-template class CpuSoftmaxGeneric<false>;
-template class CpuSoftmaxGeneric<true>;
} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/operators/CpuSoftmax.h b/src/cpu/operators/CpuSoftmax.h
index 8cab70e14f..47020e9b7c 100644
--- a/src/cpu/operators/CpuSoftmax.h
+++ b/src/cpu/operators/CpuSoftmax.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_CPU_SOFTMAX_H
-#define ARM_COMPUTE_CPU_SOFTMAX_H
+#ifndef ACL_SRC_CPU_OPERATORS_CPUSOFTMAX_H
+#define ACL_SRC_CPU_OPERATORS_CPUSOFTMAX_H
#include "arm_compute/core/experimental/Types.h"
#include "arm_compute/core/TensorInfo.h"
@@ -37,9 +37,7 @@ namespace arm_compute
{
namespace cpu
{
-class CpuLogits1DMaxKernel;
-template <bool IS_LOG>
-class CpuLogits1DSoftmaxKernel;
+class CpuSoftmaxKernel;
/** Basic function to compute a SoftmaxLayer and a Log SoftmaxLayer.
*
@@ -52,31 +50,31 @@ class CpuLogits1DSoftmaxKernel;
* This function runs the following function/kernels:
* -# If axis is not 0:
* -# @ref CpuPermute
- * -# @ref kernels::CpuLogits1DMaxKernel
- * -# @ref kernels::CpuLogits1DSoftmaxKernel
+ * -# @ref kernels::CpuSoftmaxKernel
*/
-template <bool IS_LOG = false>
class CpuSoftmaxGeneric : public ICpuOperator
{
public:
CpuSoftmaxGeneric();
/** Set the input and output tensors.
*
- * @param[in,out] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * last value of each row to the nearest multiple.
- * @param[out] dst Destination tensor ifo. Data types supported: same as @p input.
- * @param[in] beta (Optional) A scaling factor for the exponent.
- * @param[in] axis (Optional) The dimension in which to apply the function. E.g. for input of shape 4x5x6 and
+ * @param[in,out] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+ * last value of each row to the nearest multiple.
+ * @param[out] dst Destination tensor ifo. Data types supported: same as @p input.
+ * @param[in] beta (Optional) A scaling factor for the exponent.
+ * @param[in] axis (Optional) The dimension in which to apply the function. E.g. for input of shape 4x5x6 and
* axis=1, softmax will be applied to 4x6=24 vectors of size 5. Defaults to 0
+ * @param[in] is_log True if the operation is log-softmax
*/
- void configure(const ITensorInfo *src, ITensorInfo *dst, float beta = 1.0f, int32_t axis = 0);
+ void configure(const ITensorInfo *src, ITensorInfo *dst, float beta = 1.0f, int32_t axis = 0, bool is_log = false);
/** Static function to check if given info will lead to a valid configuration
*
* Similar to @ref CpuSoftmaxGeneric::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst, float beta = 1.0f, int32_t axis = 0);
+ static Status
+ validate(const ITensorInfo *src, const ITensorInfo *dst, float beta = 1.0f, int32_t axis = 0, bool is_log = false);
// Inherited methods overridden:
void run(ITensorPack &tensors) override;
@@ -85,8 +83,7 @@ public:
private:
enum InternalTensorIdx
{
- MAX = 0,
- TMP,
+ TMP = 0,
PERMUTED_SRC,
PERMUTED_DST,
COUNT
@@ -94,10 +91,8 @@ private:
CpuPermute _permute_input;
CpuPermute _permute_output;
- std::unique_ptr<ICPPKernel> _max_kernel;
std::unique_ptr<ICPPKernel> _softmax_kernel;
- TensorInfo _max;
TensorInfo _tmp;
TensorInfo _input_permuted;
TensorInfo _output_permuted;
@@ -105,9 +100,7 @@ private:
bool _needs_permute;
experimental::MemoryRequirements _aux_mem{};
};
-using CpuSoftmax = CpuSoftmaxGeneric<false>;
-using CpuLogSoftmax = CpuSoftmaxGeneric<true>;
} // namespace cpu
} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_SOFTMAX_H */
+#endif // ACL_SRC_CPU_OPERATORS_CPUSOFTMAX_H
diff --git a/src/runtime/NEON/functions/NESoftmaxLayer.cpp b/src/runtime/NEON/functions/NESoftmaxLayer.cpp
index e3c2012d05..be588c5b52 100644
--- a/src/runtime/NEON/functions/NESoftmaxLayer.cpp
+++ b/src/runtime/NEON/functions/NESoftmaxLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -29,7 +29,6 @@
#include "src/core/helpers/MemoryHelpers.h"
#include "src/core/helpers/SoftmaxHelpers.h"
-#include "src/cpu/kernels/CpuSoftmaxKernel.h"
#include "src/cpu/operators/CpuSoftmax.h"
namespace arm_compute
@@ -37,13 +36,12 @@ namespace arm_compute
template <bool IS_LOG>
struct NESoftmaxLayerGeneric<IS_LOG>::Impl
{
- const ITensor *src{nullptr};
- ITensor *dst{nullptr};
- Tensor max{nullptr};
- std::unique_ptr<cpu::CpuSoftmaxGeneric<IS_LOG>> op{nullptr};
- MemoryGroup memory_group{};
- ITensorPack run_pack{};
- WorkspaceData<Tensor> workspace_tensors{};
+ const ITensor *src{nullptr};
+ ITensor *dst{nullptr};
+ std::unique_ptr<cpu::CpuSoftmaxGeneric> op{nullptr};
+ MemoryGroup memory_group{};
+ ITensorPack run_pack{};
+ WorkspaceData<Tensor> workspace_tensors{};
};
template <bool IS_LOG>
@@ -67,8 +65,8 @@ void NESoftmaxLayerGeneric<IS_LOG>::configure(ITensor *input, ITensor *output, f
_impl->src = input;
_impl->dst = output;
- _impl->op = std::make_unique<cpu::CpuSoftmaxGeneric<IS_LOG>>();
- _impl->op->configure(input->info(), output->info(), beta, axis);
+ _impl->op = std::make_unique<cpu::CpuSoftmaxGeneric>();
+ _impl->op->configure(input->info(), output->info(), beta, axis, IS_LOG);
_impl->run_pack = {{TensorType::ACL_SRC, _impl->src}, {TensorType::ACL_DST, _impl->dst}};
_impl->workspace_tensors = manage_workspace<Tensor>(_impl->op->workspace(), _impl->memory_group, _impl->run_pack);
@@ -79,7 +77,7 @@ Status
NESoftmaxLayerGeneric<IS_LOG>::validate(const ITensorInfo *input, const ITensorInfo *output, float beta, int32_t axis)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_RETURN_ON_ERROR(cpu::CpuSoftmaxGeneric<IS_LOG>::validate(input, output, beta, axis));
+ ARM_COMPUTE_RETURN_ON_ERROR(cpu::CpuSoftmaxGeneric::validate(input, output, beta, axis, IS_LOG));
return Status{};
}
diff --git a/tests/validation/NEON/SoftmaxLayer.cpp b/tests/validation/NEON/SoftmaxLayer.cpp
index b372bdf3fa..2397d81547 100644
--- a/tests/validation/NEON/SoftmaxLayer.cpp
+++ b/tests/validation/NEON/SoftmaxLayer.cpp
@@ -22,14 +22,12 @@
* SOFTWARE.
*/
#include "arm_compute/core/Types.h"
-#include "arm_compute/core/utils/StringUtils.h"
#include "arm_compute/runtime/NEON/functions/NESoftmaxLayer.h"
#include "arm_compute/runtime/Tensor.h"
#include "arm_compute/runtime/TensorAllocator.h"
#include "src/common/cpuinfo/CpuIsaInfo.h"
#include "src/cpu/kernels/CpuSoftmaxKernel.h"
#include "tests/NEON/Accessor.h"
-#include "tests/PaddingCalculator.h"
#include "tests/datasets/ShapeDatasets.h"
#include "tests/framework/Asserts.h"
#include "tests/framework/Macros.h"
@@ -42,6 +40,7 @@ namespace test
{
namespace validation
{
+using framework::dataset::make;
namespace
{
/** Tolerance for float operations */
@@ -53,7 +52,7 @@ constexpr AbsoluteTolerance<uint8_t> tolerance_qasymm8(1);
constexpr AbsoluteTolerance<int8_t> tolerance_qasymm8_signed(1);
/** CNN data types */
-const auto CNNDataTypes = framework::dataset::make("DataType",
+const auto CNNDataTypes = make("DataType",
{
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
DataType::F16,
@@ -66,53 +65,53 @@ TEST_SUITE(NEON)
TEST_SUITE(SoftmaxLayer)
// *INDENT-OFF*
// clang-format off
-DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(
- framework::dataset::make("InputInfo", { TensorInfo(TensorShape(27U, 13U), 1, DataType::F32), // Mismatching data types
- TensorInfo(TensorShape(27U, 13U), 1, DataType::F32), // Mismatching shapes
- TensorInfo(TensorShape(27U, 13U), 1, DataType::QASYMM8, // Invalid output quantization info
- QuantizationInfo(1.f/256, 12)),
- TensorInfo(TensorShape(32U, 13U), 1, DataType::F32),
- TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8,
- QuantizationInfo(1.f/256, 12)),
- TensorInfo(TensorShape(32U, 13U), 1, DataType::F32),
- TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8, //Invalid axis high
- QuantizationInfo(1.f/256, 12)),
- TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8, //Invalid axis low
- QuantizationInfo(1.f/256, 12)),
- }),
- framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(27U, 13U), 1, DataType::F16),
- TensorInfo(TensorShape(27U, 11U), 1, DataType::F32),
- TensorInfo(TensorShape(27U, 13U), 1, DataType::QASYMM8,
- QuantizationInfo(1.f/256, 12)),
- TensorInfo(TensorShape(32U, 13U), 1, DataType::F32),
- TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8,
- QuantizationInfo(1.f/256, 0)),
- TensorInfo(TensorShape(32U, 13U), 1, DataType::F32),
- TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8,
- QuantizationInfo(1.f/256, 0)),
- TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8,
- QuantizationInfo(1.f/256, 0)),
- })),
- framework::dataset::make("beta", { 1.0,
- 2.0,
- 1.0,
- 2.0,
- 1.0,
- 1.0,
- 2.0,
- 1.0,
- })),
- framework::dataset::make("axis", { 0,
- 0,
- 0,
- 1,
- 0,
- -1,
- 2,
- -3,
- })),
- framework::dataset::make("Expected", { false, false, false, true, true, true, false, false })),
- input_info, output_info, beta, axis, expected)
+DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(
+ make("InputInfo", { TensorInfo(TensorShape(27U, 13U), 1, DataType::F32), // Mismatching data types
+ TensorInfo(TensorShape(27U, 13U), 1, DataType::F32), // Mismatching shapes
+ TensorInfo(TensorShape(27U, 13U), 1, DataType::QASYMM8, // Invalid output quantization info
+ QuantizationInfo(1.f/256, 12)),
+ TensorInfo(TensorShape(32U, 13U), 1, DataType::F32),
+ TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8,
+ QuantizationInfo(1.f/256, 12)),
+ TensorInfo(TensorShape(32U, 13U), 1, DataType::F32),
+ TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8, //Invalid axis high
+ QuantizationInfo(1.f/256, 12)),
+ TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8, //Invalid axis low
+ QuantizationInfo(1.f/256, 12)),
+ }),
+ make("OutputInfo",{ TensorInfo(TensorShape(27U, 13U), 1, DataType::F16),
+ TensorInfo(TensorShape(27U, 11U), 1, DataType::F32),
+ TensorInfo(TensorShape(27U, 13U), 1, DataType::QASYMM8,
+ QuantizationInfo(1.f/256, 12)),
+ TensorInfo(TensorShape(32U, 13U), 1, DataType::F32),
+ TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8,
+ QuantizationInfo(1.f/256, 0)),
+ TensorInfo(TensorShape(32U, 13U), 1, DataType::F32),
+ TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8,
+ QuantizationInfo(1.f/256, 0)),
+ TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8,
+ QuantizationInfo(1.f/256, 0)),
+ }),
+ make("beta", { 1.0,
+ 2.0,
+ 1.0,
+ 2.0,
+ 1.0,
+ 1.0,
+ 2.0,
+ 1.0,
+ }),
+ make("axis", { 0,
+ 0,
+ 0,
+ 1,
+ 0,
+ -1,
+ 2,
+ -3,
+ }),
+ make("Expected", { false, false, false, true, true, true, false, false })),
+ input_info, output_info, beta, axis, expected)
{
ARM_COMPUTE_EXPECT(bool(NESoftmaxLayer::validate(&input_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), beta, axis)) == expected, framework::LogLevel::ERRORS);
}
@@ -122,54 +121,26 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(
template <typename T>
using NESoftmaxLayerFixture = SoftmaxValidationFixture<Tensor, Accessor, NESoftmaxLayer, T>;
-DATA_TEST_CASE(KernelSelection_max_logits, framework::DatasetMode::ALL, concat(
- combine(framework::dataset::make("CpuExt", std::string("NEON")),
- framework::dataset::make("DataType", { DataType::F32,
- DataType::F16,
- DataType::QASYMM8,
- DataType::QASYMM8_SIGNED
- })),
- combine(framework::dataset::make("CpuExt", std::string("SVE")),
- framework::dataset::make("DataType", { DataType::F32,
- DataType::F16,
- DataType::QASYMM8,
- DataType::QASYMM8_SIGNED
- }))),
- cpu_ext, data_type)
-{
- using namespace cpu::kernels;
-
- cpuinfo::CpuIsaInfo cpu_isa{};
- cpu_isa.neon = (cpu_ext == "NEON");
- cpu_isa.sve = (cpu_ext == "SVE");
- cpu_isa.fp16 = (data_type == DataType::F16);
-
- const auto *selected_impl = CpuLogits1DMaxKernel::get_implementation(DataTypeISASelectorData{ data_type, cpu_isa }, cpu::KernelSelectionType::Preferred);
-
- ARM_COMPUTE_ERROR_ON_NULLPTR(selected_impl);
-
- std::string expected = lower_string(cpu_ext) + "_" + cpu_impl_dt(data_type) + "_logits_1d_max";
- std::string actual = selected_impl->name;
-
- ARM_COMPUTE_EXPECT_EQUAL(expected, actual, framework::LogLevel::ERRORS);
-}
-
-DATA_TEST_CASE(KernelSelection_logits, framework::DatasetMode::ALL, concat(concat(
- combine(framework::dataset::make("CpuExt", std::string("NEON")),
- framework::dataset::make("DataType", { DataType::F32,
- DataType::F16,
- DataType::QASYMM8,
- DataType::QASYMM8_SIGNED
- })),
- combine(framework::dataset::make("CpuExt", std::string("SVE")),
- framework::dataset::make("DataType", { DataType::F32,
- DataType::F16
- }))),
- combine(framework::dataset::make("CpuExt", std::string("SVE2")),
- framework::dataset::make("DataType", { DataType::QASYMM8,
- DataType::QASYMM8_SIGNED
- }))),
- cpu_ext, data_type)
+DATA_TEST_CASE(KernelSelection, framework::DatasetMode::ALL,
+ concat(concat(
+ combine(
+ make("CpuExt", std::string("NEON")),
+ make("DataType", { DataType::F32,
+ DataType::F16,
+ DataType::QASYMM8,
+ DataType::QASYMM8_SIGNED})
+ ),
+ combine(
+ make("CpuExt", std::string("SVE")),
+ make("DataType", { DataType::F32,
+ DataType::F16}))
+ ),
+ combine(
+ make("CpuExt", std::string("SVE2")),
+ make("DataType", { DataType::QASYMM8,
+ DataType::QASYMM8_SIGNED}))
+ ),
+ cpu_ext, data_type)
{
using namespace cpu::kernels;
@@ -179,11 +150,12 @@ DATA_TEST_CASE(KernelSelection_logits, framework::DatasetMode::ALL, concat(conca
cpu_isa.sve2 = (cpu_ext == "SVE2");
cpu_isa.fp16 = (data_type == DataType::F16);
- const auto *selected_impl = CpuLogits1DSoftmaxKernel<false>::get_implementation(DataTypeISASelectorData{ data_type, cpu_isa }, cpu::KernelSelectionType::Preferred);
+ const auto *selected_impl = CpuSoftmaxKernel::get_implementation(
+ SoftmaxKernelDataTypeISASelectorData{ data_type, cpu_isa, false /* is_log */ }, cpu::KernelSelectionType::Preferred);
ARM_COMPUTE_ERROR_ON_NULLPTR(selected_impl);
- std::string expected = lower_string(cpu_ext) + "_" + cpu_impl_dt(data_type) + "_softmax_logits_1d";
+ std::string expected = "neon_" + cpu_impl_dt(data_type) + "_softmax";
std::string actual = selected_impl->name;
ARM_COMPUTE_EXPECT_EQUAL(expected, actual, framework::LogLevel::ERRORS);
@@ -192,26 +164,32 @@ DATA_TEST_CASE(KernelSelection_logits, framework::DatasetMode::ALL, concat(conca
TEST_SUITE(Float)
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
TEST_SUITE(FP16)
-FIXTURE_DATA_TEST_CASE(RunSmall, NESoftmaxLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::Small4DShapes(),
- framework::dataset::make("DataType", DataType::F16)),
- framework::dataset::make("Beta", { 1.0f, 2.0f })),
- framework::dataset::make("Axis", { 0, 1 })))
+FIXTURE_DATA_TEST_CASE(RunSmall, NESoftmaxLayerFixture<half>, framework::DatasetMode::PRECOMMIT,
+ combine(
+ datasets::Small4DShapes(),
+ make("DataType", DataType::F16),
+ make("Beta", { 1.0f, 2.0f }),
+ make("Axis", { 0, 1 })))
{
// Validate output
validate(Accessor(_target), _reference, tolerance_f16);
}
-FIXTURE_DATA_TEST_CASE(RunSmall4D, NESoftmaxLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::Small4DShapes(),
- framework::dataset::make("DataType", DataType::F16)),
- framework::dataset::make("Beta", { 1.0f, 2.0f })),
- framework::dataset::make("Axis", { 0, 2, -1 })))
+FIXTURE_DATA_TEST_CASE(RunSmall4D, NESoftmaxLayerFixture<half>, framework::DatasetMode::PRECOMMIT,
+ combine(
+ datasets::Small4DShapes(),
+ make("DataType", DataType::F16),
+ make("Beta", { 1.0f, 2.0f }),
+ make("Axis", { 0, 2, -1 })))
{
// Validate output
validate(Accessor(_target), _reference, tolerance_f16);
}
-FIXTURE_DATA_TEST_CASE(RunLarge, NESoftmaxLayerFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::SoftmaxLayerLargeShapes(),
- framework::dataset::make("DataType", DataType::F16)),
- framework::dataset::make("Beta", { 1.0f, 2.0f })),
- framework::dataset::make("Axis", { 0 })))
+FIXTURE_DATA_TEST_CASE(RunLarge, NESoftmaxLayerFixture<half>, framework::DatasetMode::NIGHTLY,
+ combine(
+ datasets::SoftmaxLayerLargeShapes(),
+ make("DataType", DataType::F16),
+ make("Beta", { 1.0f, 2.0f }),
+ make("Axis", { 0 })))
{
// Validate output
validate(Accessor(_target), _reference, tolerance_f16);
@@ -220,26 +198,30 @@ TEST_SUITE_END() //FP16
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
TEST_SUITE(FP32)
-FIXTURE_DATA_TEST_CASE(RunSmall2D, NESoftmaxLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SoftmaxLayerSmallShapes(),
- framework::dataset::make("DataType", DataType::F32)),
- framework::dataset::make("Beta", { 1.0f, 2.0f })),
- framework::dataset::make("Axis", { 0, -1 })))
+FIXTURE_DATA_TEST_CASE(RunSmall2D, NESoftmaxLayerFixture<float>, framework::DatasetMode::PRECOMMIT,
+ combine(
+ datasets::SoftmaxLayerSmallShapes(),
+ make("DataType", DataType::F32),
+ make("Beta", { 1.0f, 2.0f }),
+ make("Axis", { 0, -1 })))
{
// Validate output
validate(Accessor(_target), _reference, tolerance_f32);
}
-FIXTURE_DATA_TEST_CASE(RunSmall4D, NESoftmaxLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::Small4DShapes(),
- framework::dataset::make("DataType", DataType::F32)),
- framework::dataset::make("Beta", { 1.0f, 2.0f })),
- framework::dataset::make("Axis", { 0, -2, 3 })))
+FIXTURE_DATA_TEST_CASE(RunSmall4D, NESoftmaxLayerFixture<float>, framework::DatasetMode::PRECOMMIT,
+ combine(datasets::Small4DShapes(),
+ make("DataType", DataType::F32),
+ make("Beta", { 1.0f, 2.0f }),
+ make("Axis", { 0, -2, 3 })))
{
// Validate output
validate(Accessor(_target), _reference, tolerance_f32);
}
-FIXTURE_DATA_TEST_CASE(RunLarge, NESoftmaxLayerFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::SoftmaxLayerLargeShapes(),
- framework::dataset::make("DataType", DataType::F32)),
- framework::dataset::make("Beta", { 1.0f, 2.0f })),
- framework::dataset::make("Axis", { 0 })))
+FIXTURE_DATA_TEST_CASE(RunLarge, NESoftmaxLayerFixture<float>, framework::DatasetMode::NIGHTLY,
+ combine(datasets::SoftmaxLayerLargeShapes(),
+ make("DataType", DataType::F32),
+ make("Beta", { 1.0f, 2.0f }),
+ make("Axis", { 0 })))
{
// Validate output
validate(Accessor(_target), _reference, tolerance_f32);
@@ -252,29 +234,40 @@ using NESoftmaxLayerQuantizedFixture = SoftmaxValidationQuantizedFixture<Tensor,
TEST_SUITE(Quantized)
TEST_SUITE(QASYMM8)
-FIXTURE_DATA_TEST_CASE(RunSmall2D, NESoftmaxLayerQuantizedFixture<uint8_t>, framework::DatasetMode::ALL, combine(combine(combine(datasets::SoftmaxLayerSmallShapes(),
- framework::dataset::make("DataType", DataType::QASYMM8)),
- combine(framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }),
- framework::dataset::make("Beta", { 1.0f, 2.f }))),
- framework::dataset::make("Axis", { 0, -1 })))
+FIXTURE_DATA_TEST_CASE(RunSmall2D, NESoftmaxLayerQuantizedFixture<uint8_t>, framework::DatasetMode::ALL,
+ combine(
+ datasets::SoftmaxLayerSmallShapes(),
+ make("DataType", DataType::QASYMM8),
+ combine(
+ make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }),
+ make("Beta", { 1.0f, 2.f })
+ ),
+ make("Axis", { 0, -1 })))
{
// Validate output
validate(Accessor(_target), _reference, tolerance_qasymm8);
}
-FIXTURE_DATA_TEST_CASE(RunSmall4D, NESoftmaxLayerQuantizedFixture<uint8_t>, framework::DatasetMode::ALL, combine(combine(combine(datasets::Small4DShapes(),
- framework::dataset::make("DataType", DataType::QASYMM8)),
- combine(framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }),
- framework::dataset::make("Beta", { 1.0f, 2.f }))),
- framework::dataset::make("Axis", { 0, 1, -2 })))
+FIXTURE_DATA_TEST_CASE(RunSmall4D, NESoftmaxLayerQuantizedFixture<uint8_t>, framework::DatasetMode::ALL,
+ combine(
+ datasets::Small4DShapes(),
+ make("DataType", DataType::QASYMM8),
+ combine(
+ make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }),
+ make("Beta", { 1.0f, 2.f })),
+ make("Axis", { 0, 1, -2 })))
{
// Validate output
validate(Accessor(_target), _reference, tolerance_qasymm8);
}
-FIXTURE_DATA_TEST_CASE(RunLarge, NESoftmaxLayerQuantizedFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::SoftmaxLayerLargeShapes(),
- framework::dataset::make("DataType", DataType::QASYMM8)),
- combine(framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }),
- framework::dataset::make("Beta", { 1.0f, 2.0f }))),
- framework::dataset::make("Axis", { 0 })))
+FIXTURE_DATA_TEST_CASE(RunLarge, NESoftmaxLayerQuantizedFixture<uint8_t>, framework::DatasetMode::NIGHTLY,
+ combine(
+ datasets::SoftmaxLayerLargeShapes(),
+ make("DataType", DataType::QASYMM8),
+ combine(
+ make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }),
+ make("Beta", { 1.0f, 2.0f })
+ ),
+ make("Axis", { 0 })))
{
// Validate output
validate(Accessor(_target), _reference, tolerance_qasymm8);
@@ -282,20 +275,28 @@ FIXTURE_DATA_TEST_CASE(RunLarge, NESoftmaxLayerQuantizedFixture<uint8_t>, framew
TEST_SUITE_END() //QASYMM8
TEST_SUITE(QASYMM8_SIGNED)
-FIXTURE_DATA_TEST_CASE(RunSmall2D, NESoftmaxLayerQuantizedFixture<int8_t>, framework::DatasetMode::ALL, combine(combine(combine(datasets::SoftmaxLayerSmallShapes(),
- framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
- combine(framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }),
- framework::dataset::make("Beta", { 1.0f, 2.f }))),
- framework::dataset::make("Axis", { 0, -1 })))
+FIXTURE_DATA_TEST_CASE(RunSmall2D, NESoftmaxLayerQuantizedFixture<int8_t>, framework::DatasetMode::ALL,
+ combine(
+ datasets::SoftmaxLayerSmallShapes(),
+ make("DataType", DataType::QASYMM8_SIGNED),
+ combine(
+ make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }),
+ make("Beta", { 1.0f, 2.f })
+ ),
+ make("Axis", { 0, -1 })))
{
// Validate output
validate(Accessor(_target), _reference, tolerance_qasymm8_signed);
}
-FIXTURE_DATA_TEST_CASE(RunSmall4D, NESoftmaxLayerQuantizedFixture<int8_t>, framework::DatasetMode::ALL, combine(combine(combine(datasets::Small4DShapes(),
- framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
- combine(framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }),
- framework::dataset::make("Beta", { 1.0f, 2.f }))),
- framework::dataset::make("Axis", { 0, 1, -1 })))
+FIXTURE_DATA_TEST_CASE(RunSmall4D, NESoftmaxLayerQuantizedFixture<int8_t>, framework::DatasetMode::ALL,
+ combine(
+ datasets::Small4DShapes(),
+ make("DataType", DataType::QASYMM8_SIGNED),
+ combine(
+ make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }),
+ make("Beta", { 1.0f, 2.f })
+ ),
+ make("Axis", { 0, 1, -1 })))
{
// Validate output
validate(Accessor(_target), _reference, tolerance_qasymm8_signed);