From fadc9b1e0bba90d6a91beb65466b2a0895b3a5e4 Mon Sep 17 00:00:00 2001 From: Gunes Bayir Date: Tue, 7 Nov 2023 05:43:07 +0000 Subject: Optimize CpuSoftmaxKernel for axis=0 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implement a single kernel instead of having two consecutive ones. In the previous setup, one kernel was calculating the maximum value in the axis, and this maximum was being subtracted from each data while calculating the softmax, i.e. softmax(x_i) = exp(x_i - max) / sum_i( exp(x_i - max) ) This patch integrates these two stages into a single kernel for Neon™ for all data types. This will save some memory because we don't need to hold the max values in a separate auxiliary tensor. It also introduces some other optimizations that will ease memory pressure when the data type is float/half, by using the dst tensor as temporary storage for already exponentiated inputs. It removes the references to SVE and SVE2 implementations, and most of the associated files; but, it leaves the implementations as these may be used in the future. Resolves: COMPMID-6500 Signed-off-by: Gunes Bayir Change-Id: Icff9976d1214c4c6cbe15a62ca60b8a77d3784cc Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/10688 Reviewed-by: SiCong Li Comments-Addressed: Arm Jenkins Tested-by: Arm Jenkins Benchmark: Arm Jenkins --- tests/validation/NEON/SoftmaxLayer.cpp | 299 +++++++++++++++++---------------- 1 file changed, 150 insertions(+), 149 deletions(-) (limited to 'tests') diff --git a/tests/validation/NEON/SoftmaxLayer.cpp b/tests/validation/NEON/SoftmaxLayer.cpp index b372bdf3fa..2397d81547 100644 --- a/tests/validation/NEON/SoftmaxLayer.cpp +++ b/tests/validation/NEON/SoftmaxLayer.cpp @@ -22,14 +22,12 @@ * SOFTWARE. */ #include "arm_compute/core/Types.h" -#include "arm_compute/core/utils/StringUtils.h" #include "arm_compute/runtime/NEON/functions/NESoftmaxLayer.h" #include "arm_compute/runtime/Tensor.h" #include "arm_compute/runtime/TensorAllocator.h" #include "src/common/cpuinfo/CpuIsaInfo.h" #include "src/cpu/kernels/CpuSoftmaxKernel.h" #include "tests/NEON/Accessor.h" -#include "tests/PaddingCalculator.h" #include "tests/datasets/ShapeDatasets.h" #include "tests/framework/Asserts.h" #include "tests/framework/Macros.h" @@ -42,6 +40,7 @@ namespace test { namespace validation { +using framework::dataset::make; namespace { /** Tolerance for float operations */ @@ -53,7 +52,7 @@ constexpr AbsoluteTolerance tolerance_qasymm8(1); constexpr AbsoluteTolerance tolerance_qasymm8_signed(1); /** CNN data types */ -const auto CNNDataTypes = framework::dataset::make("DataType", +const auto CNNDataTypes = make("DataType", { #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC DataType::F16, @@ -66,53 +65,53 @@ TEST_SUITE(NEON) TEST_SUITE(SoftmaxLayer) // *INDENT-OFF* // clang-format off -DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip( - framework::dataset::make("InputInfo", { TensorInfo(TensorShape(27U, 13U), 1, DataType::F32), // Mismatching data types - TensorInfo(TensorShape(27U, 13U), 1, DataType::F32), // Mismatching shapes - TensorInfo(TensorShape(27U, 13U), 1, DataType::QASYMM8, // Invalid output quantization info - QuantizationInfo(1.f/256, 12)), - TensorInfo(TensorShape(32U, 13U), 1, DataType::F32), - TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8, - QuantizationInfo(1.f/256, 12)), - TensorInfo(TensorShape(32U, 13U), 1, DataType::F32), - TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8, //Invalid axis high - QuantizationInfo(1.f/256, 12)), - TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8, //Invalid axis low - QuantizationInfo(1.f/256, 12)), - }), - framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(27U, 13U), 1, DataType::F16), - TensorInfo(TensorShape(27U, 11U), 1, DataType::F32), - TensorInfo(TensorShape(27U, 13U), 1, DataType::QASYMM8, - QuantizationInfo(1.f/256, 12)), - TensorInfo(TensorShape(32U, 13U), 1, DataType::F32), - TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8, - QuantizationInfo(1.f/256, 0)), - TensorInfo(TensorShape(32U, 13U), 1, DataType::F32), - TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8, - QuantizationInfo(1.f/256, 0)), - TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8, - QuantizationInfo(1.f/256, 0)), - })), - framework::dataset::make("beta", { 1.0, - 2.0, - 1.0, - 2.0, - 1.0, - 1.0, - 2.0, - 1.0, - })), - framework::dataset::make("axis", { 0, - 0, - 0, - 1, - 0, - -1, - 2, - -3, - })), - framework::dataset::make("Expected", { false, false, false, true, true, true, false, false })), - input_info, output_info, beta, axis, expected) +DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip( + make("InputInfo", { TensorInfo(TensorShape(27U, 13U), 1, DataType::F32), // Mismatching data types + TensorInfo(TensorShape(27U, 13U), 1, DataType::F32), // Mismatching shapes + TensorInfo(TensorShape(27U, 13U), 1, DataType::QASYMM8, // Invalid output quantization info + QuantizationInfo(1.f/256, 12)), + TensorInfo(TensorShape(32U, 13U), 1, DataType::F32), + TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8, + QuantizationInfo(1.f/256, 12)), + TensorInfo(TensorShape(32U, 13U), 1, DataType::F32), + TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8, //Invalid axis high + QuantizationInfo(1.f/256, 12)), + TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8, //Invalid axis low + QuantizationInfo(1.f/256, 12)), + }), + make("OutputInfo",{ TensorInfo(TensorShape(27U, 13U), 1, DataType::F16), + TensorInfo(TensorShape(27U, 11U), 1, DataType::F32), + TensorInfo(TensorShape(27U, 13U), 1, DataType::QASYMM8, + QuantizationInfo(1.f/256, 12)), + TensorInfo(TensorShape(32U, 13U), 1, DataType::F32), + TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8, + QuantizationInfo(1.f/256, 0)), + TensorInfo(TensorShape(32U, 13U), 1, DataType::F32), + TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8, + QuantizationInfo(1.f/256, 0)), + TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8, + QuantizationInfo(1.f/256, 0)), + }), + make("beta", { 1.0, + 2.0, + 1.0, + 2.0, + 1.0, + 1.0, + 2.0, + 1.0, + }), + make("axis", { 0, + 0, + 0, + 1, + 0, + -1, + 2, + -3, + }), + make("Expected", { false, false, false, true, true, true, false, false })), + input_info, output_info, beta, axis, expected) { ARM_COMPUTE_EXPECT(bool(NESoftmaxLayer::validate(&input_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), beta, axis)) == expected, framework::LogLevel::ERRORS); } @@ -122,54 +121,26 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip( template using NESoftmaxLayerFixture = SoftmaxValidationFixture; -DATA_TEST_CASE(KernelSelection_max_logits, framework::DatasetMode::ALL, concat( - combine(framework::dataset::make("CpuExt", std::string("NEON")), - framework::dataset::make("DataType", { DataType::F32, - DataType::F16, - DataType::QASYMM8, - DataType::QASYMM8_SIGNED - })), - combine(framework::dataset::make("CpuExt", std::string("SVE")), - framework::dataset::make("DataType", { DataType::F32, - DataType::F16, - DataType::QASYMM8, - DataType::QASYMM8_SIGNED - }))), - cpu_ext, data_type) -{ - using namespace cpu::kernels; - - cpuinfo::CpuIsaInfo cpu_isa{}; - cpu_isa.neon = (cpu_ext == "NEON"); - cpu_isa.sve = (cpu_ext == "SVE"); - cpu_isa.fp16 = (data_type == DataType::F16); - - const auto *selected_impl = CpuLogits1DMaxKernel::get_implementation(DataTypeISASelectorData{ data_type, cpu_isa }, cpu::KernelSelectionType::Preferred); - - ARM_COMPUTE_ERROR_ON_NULLPTR(selected_impl); - - std::string expected = lower_string(cpu_ext) + "_" + cpu_impl_dt(data_type) + "_logits_1d_max"; - std::string actual = selected_impl->name; - - ARM_COMPUTE_EXPECT_EQUAL(expected, actual, framework::LogLevel::ERRORS); -} - -DATA_TEST_CASE(KernelSelection_logits, framework::DatasetMode::ALL, concat(concat( - combine(framework::dataset::make("CpuExt", std::string("NEON")), - framework::dataset::make("DataType", { DataType::F32, - DataType::F16, - DataType::QASYMM8, - DataType::QASYMM8_SIGNED - })), - combine(framework::dataset::make("CpuExt", std::string("SVE")), - framework::dataset::make("DataType", { DataType::F32, - DataType::F16 - }))), - combine(framework::dataset::make("CpuExt", std::string("SVE2")), - framework::dataset::make("DataType", { DataType::QASYMM8, - DataType::QASYMM8_SIGNED - }))), - cpu_ext, data_type) +DATA_TEST_CASE(KernelSelection, framework::DatasetMode::ALL, + concat(concat( + combine( + make("CpuExt", std::string("NEON")), + make("DataType", { DataType::F32, + DataType::F16, + DataType::QASYMM8, + DataType::QASYMM8_SIGNED}) + ), + combine( + make("CpuExt", std::string("SVE")), + make("DataType", { DataType::F32, + DataType::F16})) + ), + combine( + make("CpuExt", std::string("SVE2")), + make("DataType", { DataType::QASYMM8, + DataType::QASYMM8_SIGNED})) + ), + cpu_ext, data_type) { using namespace cpu::kernels; @@ -179,11 +150,12 @@ DATA_TEST_CASE(KernelSelection_logits, framework::DatasetMode::ALL, concat(conca cpu_isa.sve2 = (cpu_ext == "SVE2"); cpu_isa.fp16 = (data_type == DataType::F16); - const auto *selected_impl = CpuLogits1DSoftmaxKernel::get_implementation(DataTypeISASelectorData{ data_type, cpu_isa }, cpu::KernelSelectionType::Preferred); + const auto *selected_impl = CpuSoftmaxKernel::get_implementation( + SoftmaxKernelDataTypeISASelectorData{ data_type, cpu_isa, false /* is_log */ }, cpu::KernelSelectionType::Preferred); ARM_COMPUTE_ERROR_ON_NULLPTR(selected_impl); - std::string expected = lower_string(cpu_ext) + "_" + cpu_impl_dt(data_type) + "_softmax_logits_1d"; + std::string expected = "neon_" + cpu_impl_dt(data_type) + "_softmax"; std::string actual = selected_impl->name; ARM_COMPUTE_EXPECT_EQUAL(expected, actual, framework::LogLevel::ERRORS); @@ -192,26 +164,32 @@ DATA_TEST_CASE(KernelSelection_logits, framework::DatasetMode::ALL, concat(conca TEST_SUITE(Float) #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC TEST_SUITE(FP16) -FIXTURE_DATA_TEST_CASE(RunSmall, NESoftmaxLayerFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::Small4DShapes(), - framework::dataset::make("DataType", DataType::F16)), - framework::dataset::make("Beta", { 1.0f, 2.0f })), - framework::dataset::make("Axis", { 0, 1 }))) +FIXTURE_DATA_TEST_CASE(RunSmall, NESoftmaxLayerFixture, framework::DatasetMode::PRECOMMIT, + combine( + datasets::Small4DShapes(), + make("DataType", DataType::F16), + make("Beta", { 1.0f, 2.0f }), + make("Axis", { 0, 1 }))) { // Validate output validate(Accessor(_target), _reference, tolerance_f16); } -FIXTURE_DATA_TEST_CASE(RunSmall4D, NESoftmaxLayerFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::Small4DShapes(), - framework::dataset::make("DataType", DataType::F16)), - framework::dataset::make("Beta", { 1.0f, 2.0f })), - framework::dataset::make("Axis", { 0, 2, -1 }))) +FIXTURE_DATA_TEST_CASE(RunSmall4D, NESoftmaxLayerFixture, framework::DatasetMode::PRECOMMIT, + combine( + datasets::Small4DShapes(), + make("DataType", DataType::F16), + make("Beta", { 1.0f, 2.0f }), + make("Axis", { 0, 2, -1 }))) { // Validate output validate(Accessor(_target), _reference, tolerance_f16); } -FIXTURE_DATA_TEST_CASE(RunLarge, NESoftmaxLayerFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::SoftmaxLayerLargeShapes(), - framework::dataset::make("DataType", DataType::F16)), - framework::dataset::make("Beta", { 1.0f, 2.0f })), - framework::dataset::make("Axis", { 0 }))) +FIXTURE_DATA_TEST_CASE(RunLarge, NESoftmaxLayerFixture, framework::DatasetMode::NIGHTLY, + combine( + datasets::SoftmaxLayerLargeShapes(), + make("DataType", DataType::F16), + make("Beta", { 1.0f, 2.0f }), + make("Axis", { 0 }))) { // Validate output validate(Accessor(_target), _reference, tolerance_f16); @@ -220,26 +198,30 @@ TEST_SUITE_END() //FP16 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ TEST_SUITE(FP32) -FIXTURE_DATA_TEST_CASE(RunSmall2D, NESoftmaxLayerFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SoftmaxLayerSmallShapes(), - framework::dataset::make("DataType", DataType::F32)), - framework::dataset::make("Beta", { 1.0f, 2.0f })), - framework::dataset::make("Axis", { 0, -1 }))) +FIXTURE_DATA_TEST_CASE(RunSmall2D, NESoftmaxLayerFixture, framework::DatasetMode::PRECOMMIT, + combine( + datasets::SoftmaxLayerSmallShapes(), + make("DataType", DataType::F32), + make("Beta", { 1.0f, 2.0f }), + make("Axis", { 0, -1 }))) { // Validate output validate(Accessor(_target), _reference, tolerance_f32); } -FIXTURE_DATA_TEST_CASE(RunSmall4D, NESoftmaxLayerFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::Small4DShapes(), - framework::dataset::make("DataType", DataType::F32)), - framework::dataset::make("Beta", { 1.0f, 2.0f })), - framework::dataset::make("Axis", { 0, -2, 3 }))) +FIXTURE_DATA_TEST_CASE(RunSmall4D, NESoftmaxLayerFixture, framework::DatasetMode::PRECOMMIT, + combine(datasets::Small4DShapes(), + make("DataType", DataType::F32), + make("Beta", { 1.0f, 2.0f }), + make("Axis", { 0, -2, 3 }))) { // Validate output validate(Accessor(_target), _reference, tolerance_f32); } -FIXTURE_DATA_TEST_CASE(RunLarge, NESoftmaxLayerFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::SoftmaxLayerLargeShapes(), - framework::dataset::make("DataType", DataType::F32)), - framework::dataset::make("Beta", { 1.0f, 2.0f })), - framework::dataset::make("Axis", { 0 }))) +FIXTURE_DATA_TEST_CASE(RunLarge, NESoftmaxLayerFixture, framework::DatasetMode::NIGHTLY, + combine(datasets::SoftmaxLayerLargeShapes(), + make("DataType", DataType::F32), + make("Beta", { 1.0f, 2.0f }), + make("Axis", { 0 }))) { // Validate output validate(Accessor(_target), _reference, tolerance_f32); @@ -252,29 +234,40 @@ using NESoftmaxLayerQuantizedFixture = SoftmaxValidationQuantizedFixture, framework::DatasetMode::ALL, combine(combine(combine(datasets::SoftmaxLayerSmallShapes(), - framework::dataset::make("DataType", DataType::QASYMM8)), - combine(framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }), - framework::dataset::make("Beta", { 1.0f, 2.f }))), - framework::dataset::make("Axis", { 0, -1 }))) +FIXTURE_DATA_TEST_CASE(RunSmall2D, NESoftmaxLayerQuantizedFixture, framework::DatasetMode::ALL, + combine( + datasets::SoftmaxLayerSmallShapes(), + make("DataType", DataType::QASYMM8), + combine( + make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }), + make("Beta", { 1.0f, 2.f }) + ), + make("Axis", { 0, -1 }))) { // Validate output validate(Accessor(_target), _reference, tolerance_qasymm8); } -FIXTURE_DATA_TEST_CASE(RunSmall4D, NESoftmaxLayerQuantizedFixture, framework::DatasetMode::ALL, combine(combine(combine(datasets::Small4DShapes(), - framework::dataset::make("DataType", DataType::QASYMM8)), - combine(framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }), - framework::dataset::make("Beta", { 1.0f, 2.f }))), - framework::dataset::make("Axis", { 0, 1, -2 }))) +FIXTURE_DATA_TEST_CASE(RunSmall4D, NESoftmaxLayerQuantizedFixture, framework::DatasetMode::ALL, + combine( + datasets::Small4DShapes(), + make("DataType", DataType::QASYMM8), + combine( + make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }), + make("Beta", { 1.0f, 2.f })), + make("Axis", { 0, 1, -2 }))) { // Validate output validate(Accessor(_target), _reference, tolerance_qasymm8); } -FIXTURE_DATA_TEST_CASE(RunLarge, NESoftmaxLayerQuantizedFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::SoftmaxLayerLargeShapes(), - framework::dataset::make("DataType", DataType::QASYMM8)), - combine(framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }), - framework::dataset::make("Beta", { 1.0f, 2.0f }))), - framework::dataset::make("Axis", { 0 }))) +FIXTURE_DATA_TEST_CASE(RunLarge, NESoftmaxLayerQuantizedFixture, framework::DatasetMode::NIGHTLY, + combine( + datasets::SoftmaxLayerLargeShapes(), + make("DataType", DataType::QASYMM8), + combine( + make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }), + make("Beta", { 1.0f, 2.0f }) + ), + make("Axis", { 0 }))) { // Validate output validate(Accessor(_target), _reference, tolerance_qasymm8); @@ -282,20 +275,28 @@ FIXTURE_DATA_TEST_CASE(RunLarge, NESoftmaxLayerQuantizedFixture, framew TEST_SUITE_END() //QASYMM8 TEST_SUITE(QASYMM8_SIGNED) -FIXTURE_DATA_TEST_CASE(RunSmall2D, NESoftmaxLayerQuantizedFixture, framework::DatasetMode::ALL, combine(combine(combine(datasets::SoftmaxLayerSmallShapes(), - framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)), - combine(framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }), - framework::dataset::make("Beta", { 1.0f, 2.f }))), - framework::dataset::make("Axis", { 0, -1 }))) +FIXTURE_DATA_TEST_CASE(RunSmall2D, NESoftmaxLayerQuantizedFixture, framework::DatasetMode::ALL, + combine( + datasets::SoftmaxLayerSmallShapes(), + make("DataType", DataType::QASYMM8_SIGNED), + combine( + make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }), + make("Beta", { 1.0f, 2.f }) + ), + make("Axis", { 0, -1 }))) { // Validate output validate(Accessor(_target), _reference, tolerance_qasymm8_signed); } -FIXTURE_DATA_TEST_CASE(RunSmall4D, NESoftmaxLayerQuantizedFixture, framework::DatasetMode::ALL, combine(combine(combine(datasets::Small4DShapes(), - framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)), - combine(framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }), - framework::dataset::make("Beta", { 1.0f, 2.f }))), - framework::dataset::make("Axis", { 0, 1, -1 }))) +FIXTURE_DATA_TEST_CASE(RunSmall4D, NESoftmaxLayerQuantizedFixture, framework::DatasetMode::ALL, + combine( + datasets::Small4DShapes(), + make("DataType", DataType::QASYMM8_SIGNED), + combine( + make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }), + make("Beta", { 1.0f, 2.f }) + ), + make("Axis", { 0, 1, -1 }))) { // Validate output validate(Accessor(_target), _reference, tolerance_qasymm8_signed); -- cgit v1.2.1