/*
 * Copyright (c) 2017-2024 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "arm_compute/core/Types.h"
#include "arm_compute/core/utils/StringUtils.h"
#include "arm_compute/runtime/NEON/functions/NEGEMM.h"
#include "arm_compute/runtime/Tensor.h"
#include "arm_compute/runtime/TensorAllocator.h"

#include "src/core/helpers/MemoryHelpers.h"
#include "src/cpu/kernels/CpuGemmInterleave4x4Kernel.h"
#include "src/cpu/kernels/CpuGemmMatrixAdditionKernel.h" // used by the KERNEL_SELECTION test below
#include "src/cpu/kernels/CpuGemmMatrixMultiplyKernel.h"
#include "src/cpu/kernels/CpuGemmTranspose1xWKernel.h"
#include "src/cpu/operators/CpuGemm.h"
#include "tests/NEON/Accessor.h"
#include "tests/NEON/Helper.h"
#include "tests/PaddingCalculator.h"
#include "tests/datasets/LargeGEMMDataset.h"
#include "tests/datasets/SmallGEMMDataset.h"
#include "tests/datasets/TinyGEMMDataset.h"
#include "tests/framework/Asserts.h"
#include "tests/framework/Macros.h"
#include "tests/framework/datasets/Datasets.h"
#include "tests/validation/Validation.h"
#include "tests/validation/fixtures/GEMMFixture.h"
#include "tests/validation/fixtures/GEMMInterleave4x4Fixture.h"
#include "tests/validation/fixtures/GEMMTranspose1xWFixture.h"

namespace arm_compute
{
namespace test
{
namespace validation
{
using framework::dataset::make;

namespace
{
constexpr AbsoluteTolerance<float> tolerance_f(0.001f); /**< Tolerance value for comparing reference's output against implementation's output for FP32 data types */
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
RelativeTolerance<half>        rel_tolerance_f16(half(0.2)); /**< Relative tolerance value for comparing reference's output against implementation's output for FP16 data types */
const AbsoluteTolerance<float> abs_tolerance_f16(0.2f);      /**< Absolute tolerance value for comparing reference's output against implementation's output for FP16 data types */
constexpr float                tolerance_num = 0.07f;        /**< Tolerance number for FP16 data types */
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */

/** CNN data types */
const auto CNNDataTypes = make("DataType",
{
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
    DataType::F16,
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
    DataType::F32,
});

const auto data_interleave = make("M", 8, 12) * make("N", 8, 12);
const auto data_transpose  = make("M", 8, 14) * make("N", 7, 14);

/** Zero padding test */
template <typename FunctionType>
bool validate_zero_padding(unsigned int dim0_value, unsigned int dim1_value)
{
    const TensorShape in_shape(dim0_value, dim1_value);
    TensorInfo        in(in_shape, 1, DataType::U32);
    TensorInfo        dst;

    ARM_COMPUTE_EXPECT(in.is_resizable(), framework::LogLevel::ERRORS);
    // Validate zero-padding
    FunctionType func;
    func.configure(&in, &dst);

    return in.padding().empty();
}

/* Zero padding test for GEMM kernels */
bool validate_gemm_zero_padding(const TensorShape shape0, const TensorShape shape1)
{
    // Create tensors
    TensorInfo in0(shape0, 1, DataType::F32);
    TensorInfo in1(shape1, 1, DataType::F32);
    TensorInfo dst;

    // Validate zero-padding
    cpu::kernels::CpuGemmMatrixMultiplyKernel gemm;
    gemm.configure(&in0, &in1, &dst, 1.0, false);

    return in0.padding().empty() && in1.padding().empty() && dst.padding().empty();
}
} // namespace

TEST_SUITE(NEON)
TEST_SUITE(GEMM)

/** Test case for memory injection in @ref cpu::CpuGemm.
 *
 * Configure the operator once and inject memory at run-time in multiple executions.
 *
 * Checks performed in order:
 * - Both runs compute the same output
 */
TEST_CASE(MemoryInjection, framework::DatasetMode::ALL)
{
    auto       gemm      = std::make_unique<cpu::CpuGemm>();
    const auto lhs_info  = TensorInfo(TensorShape(3U, 3U), 1, DataType::F32);
    const auto rhs_info  = TensorInfo(TensorShape(4U, 3U), 1, DataType::F32);
    const auto c_info    = TensorInfo(TensorShape(4U, 3U), 1, DataType::F32);
    auto       dst_info  = TensorInfo(TensorShape(4U, 3U), 1, DataType::F32);
    const auto gemm_info = GEMMInfo{};
    gemm->configure(&lhs_info, &rhs_info, &c_info, &dst_info, 1.f, 1.f, gemm_info);

    // The tensors are newly created every call of this lambda function
    auto lhs = create_tensor<Tensor>(lhs_info);
    auto rhs = create_tensor<Tensor>(rhs_info);
    auto c   = create_tensor<Tensor>(c_info);
    lhs.allocator()->allocate();
    rhs.allocator()->allocate();
    c.allocator()->allocate();

    ITensorPack run_pack{ { TensorType::ACL_SRC_0, &lhs }, { TensorType::ACL_SRC_1, &rhs }, { TensorType::ACL_SRC_2, &c } };
    ITensorPack prep_pack{ { TensorType::ACL_SRC_1, &rhs }, { TensorType::ACL_SRC_2, &c } };

    auto mg = MemoryGroup{};
    auto ws = manage_workspace<Tensor>(gemm->workspace(), mg, run_pack, prep_pack);

    auto run_conv = [&]() -> Tensor
    {
        auto dst = create_tensor<Tensor>(dst_info);
        dst.allocator()->allocate();
        run_pack.add_tensor(TensorType::ACL_DST, &dst);

        library->fill_tensor_value(Accessor(lhs), 1.f);
        library->fill_tensor_value(Accessor(rhs), 2.f);
        library->fill_tensor_value(Accessor(c), 3.f);
        // This operator is configured once and captured by this lambda.
        gemm->prepare(prep_pack);
        gemm->run(run_pack);
        return dst;
    };
    auto result_0 = run_conv();
    auto result_1 = run_conv();
    for(size_t i = 0; i < result_0.info()->tensor_shape().total_size(); ++i)
    {
        ARM_COMPUTE_EXPECT(((float *)result_0.buffer())[i] == ((float *)result_1.buffer())[i], framework::LogLevel::ERRORS);
    }
}

/** Test case for memory injection in @ref NEGEMM.
 *
 * Make sure @ref NEGEMM still works through injecting the memory at configure time using the old API.
 *
 * Checks performed in order:
 * - Both runs compute the same output
 */
TEST_CASE(MultipleExecutionWithConfigure, framework::DatasetMode::ALL)
{
    auto       gemm      = std::make_unique<NEGEMM>();
    const auto lhs_info  = TensorInfo(TensorShape(3U, 3U), 1, DataType::F32);
    const auto rhs_info  = TensorInfo(TensorShape(4U, 3U), 1, DataType::F32);
    const auto c_info    = TensorInfo(TensorShape(4U, 3U), 1, DataType::F32);
    auto       dst_info  = TensorInfo(TensorShape(4U, 3U), 1, DataType::F32);
    const auto gemm_info = GEMMInfo{};
    auto run_conv = [&]()
    {
        auto lhs = create_tensor<Tensor>(lhs_info);
        auto rhs = create_tensor<Tensor>(rhs_info);
        auto c   = create_tensor<Tensor>(c_info);
        auto dst = create_tensor<Tensor>(dst_info);
        gemm->configure(&lhs, &rhs, &c, &dst, 1.f, 1.f, gemm_info);
        lhs.allocator()->allocate();
        rhs.allocator()->allocate();
        c.allocator()->allocate();
        dst.allocator()->allocate();
        library->fill_tensor_value(Accessor(lhs), 1.f);
        library->fill_tensor_value(Accessor(rhs), 2.f);
        library->fill_tensor_value(Accessor(c), 3.f);
        gemm->run();
        return dst;
    };
    auto result_0 = run_conv();
    auto result_1 = run_conv();
    for(size_t i = 0; i < result_0.info()->tensor_shape().total_size(); ++i)
    {
        ARM_COMPUTE_EXPECT(((float *)result_0.buffer())[i] == ((float *)result_1.buffer())[i], framework::LogLevel::ERRORS);
    }
}

// *INDENT-OFF*
// clang-format off
DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
               make("LhsInfo", { TensorInfo(TensorShape(27U, 13U), 1, DataType::S32), // Unsupported data type
                                 TensorInfo(TensorShape(27U, 13U), 1, DataType::F32),
                               }),
               make("RhsInfo", { TensorInfo(TensorShape(8U, 27U), 1, DataType::S32),
                                 TensorInfo(TensorShape(8U, 27U), 1, DataType::F32),
                               })),
               make("OutputInfo", { TensorInfo(TensorShape(8U, 13U), 1, DataType::S32),
                                    TensorInfo(TensorShape(8U, 13U), 1, DataType::F32),
                                  })),
               make("Expected", { false, true })),
               lhs_info, rhs_info, output_info, expected)
{
    constexpr float alpha     = 1.0;
    constexpr float beta      = 0.0;
    const auto      gemm_info = GEMMInfo();

    bool is_valid = bool(NEGEMM::validate(&lhs_info.clone()->set_is_resizable(true),
                                          &rhs_info.clone()->set_is_resizable(true),
                                          nullptr,
                                          &output_info.clone()->set_is_resizable(true),
                                          alpha, beta, gemm_info));

    ARM_COMPUTE_EXPECT(is_valid == expected, framework::LogLevel::ERRORS);
}
// clang-format on
// *INDENT-ON*

TEST_SUITE(KERNEL_SELECTION)
DATA_TEST_CASE(KernelSelection_mul_and_add, framework::DatasetMode::ALL,
               combine(make("CpuExt", std::string("NEON")),
                       make("DataType", { DataType::F32, DataType::F16 })),
               cpu_ext, data_type)
{
    using namespace cpu::kernels;

    cpuinfo::CpuIsaInfo cpu_isa{};
    cpu_isa.neon = (cpu_ext == "NEON");
    cpu_isa.fp16 = (data_type == DataType::F16);

    const auto *selected_impl_mul = CpuGemmMatrixMultiplyKernel::get_implementation(DataTypeISASelectorData{ data_type, cpu_isa }, cpu::KernelSelectionType::Preferred);

    ARM_COMPUTE_ERROR_ON_NULLPTR(selected_impl_mul);

    std::string expected = lower_string(cpu_ext) + "_" + cpu_impl_dt(data_type) + "_gemm_matrix_mul";
    std::string actual   = selected_impl_mul->name;

    ARM_COMPUTE_EXPECT_EQUAL(expected, actual, framework::LogLevel::ERRORS);

    const auto *selected_impl_add = CpuGemmMatrixAdditionKernel::get_implementation(DataTypeISASelectorData{ data_type, cpu_isa }, cpu::KernelSelectionType::Preferred);

    ARM_COMPUTE_ERROR_ON_NULLPTR(selected_impl_add);

    expected = lower_string(cpu_ext) + "_" + cpu_impl_dt(data_type) + "_gemm_matrix_add";
    actual   = selected_impl_add->name;

    ARM_COMPUTE_EXPECT_EQUAL(expected, actual, framework::LogLevel::ERRORS);
}
TEST_SUITE_END() // KERNEL_SELECTION

TEST_SUITE(TRANSPOSE_1XW)
using CpuGemmTranspose1xW = NESynthetizeFunctionWithZeroConstantKernelBorder<cpu::kernels::CpuGemmTranspose1xWKernel>;
DATA_TEST_CASE(ValidateZeroPadding, framework::DatasetMode::ALL, zip(
               make("N", { 1, 23, 63, 101 }),
               make("K", { 1, 47, 29, 27 })),
               n_value, k_value)
{
    bool status = validate_zero_padding<CpuGemmTranspose1xW>(n_value, k_value);
    ARM_COMPUTE_EXPECT(status, framework::LogLevel::ERRORS);
}

TEST_SUITE(U32)
using CpuGemmTranspose1xWFixture = GEMMTranspose1xWValidationFixture<Tensor, Accessor, CpuGemmTranspose1xW, uint32_t>;
FIXTURE_DATA_TEST_CASE(RunSmall, CpuGemmTranspose1xWFixture, framework::DatasetMode::PRECOMMIT, data_transpose * make("DataType", DataType::U32))
{
    // Validate output
    validate(Accessor(_target), _reference);
}
TEST_SUITE_END() // U32

TEST_SUITE(U16)
using CpuGemmTranspose1xWFixture = GEMMTranspose1xWValidationFixture<Tensor, Accessor, CpuGemmTranspose1xW, uint16_t>;
FIXTURE_DATA_TEST_CASE(RunSmall, CpuGemmTranspose1xWFixture, framework::DatasetMode::PRECOMMIT, data_transpose * make("DataType", DataType::U16))
{
    // Validate output
    validate(Accessor(_target), _reference);
}
TEST_SUITE_END() // U16

TEST_SUITE(U8)
using CpuGemmTranspose1xWFixture = GEMMTranspose1xWValidationFixture<Tensor, Accessor, CpuGemmTranspose1xW, uint8_t>;
FIXTURE_DATA_TEST_CASE(RunSmall, CpuGemmTranspose1xWFixture, framework::DatasetMode::PRECOMMIT, data_transpose * make("DataType", DataType::U8))
{
    // Validate output
    validate(Accessor(_target), _reference);
}
TEST_SUITE_END() // U8

TEST_SUITE_END() // TRANSPOSE_1XW

TEST_SUITE(INTERLEAVE_4X4)
using CpuGemmInterleave4x4 = NESynthetizeFunctionWithZeroConstantKernelBorder<cpu::kernels::CpuGemmInterleave4x4Kernel>;

DATA_TEST_CASE(ValidateZeroPadding, framework::DatasetMode::ALL, zip(
               make("M", { 1, 23, 63, 101 }),
               make("K", { 1, 47, 29, 27 })),
               m_value, k_value)
{
    bool status = validate_zero_padding<CpuGemmInterleave4x4>(m_value, k_value);
    ARM_COMPUTE_EXPECT(status, framework::LogLevel::ERRORS);
}

TEST_SUITE(U32)
using CpuGemmInterleave4x4Fixture = GEMMInterleave4x4ValidationFixture<Tensor, Accessor, CpuGemmInterleave4x4, uint32_t>;
FIXTURE_DATA_TEST_CASE(RunSmall, CpuGemmInterleave4x4Fixture, framework::DatasetMode::PRECOMMIT, data_interleave * make("DataType", DataType::U32))
{
    // Validate output
    validate(Accessor(_target), _reference);
}
TEST_SUITE_END() // U32

TEST_SUITE(U16)
using CpuGemmInterleave4x4Fixture = GEMMInterleave4x4ValidationFixture<Tensor, Accessor, CpuGemmInterleave4x4, uint16_t>;
FIXTURE_DATA_TEST_CASE(RunSmall, CpuGemmInterleave4x4Fixture, framework::DatasetMode::PRECOMMIT, data_interleave * make("DataType", DataType::U16))
{
    // Validate output
    validate(Accessor(_target), _reference);
}
TEST_SUITE_END() // U16

TEST_SUITE(U8)
using CpuGemmInterleave4x4Fixture = GEMMInterleave4x4ValidationFixture<Tensor, Accessor, CpuGemmInterleave4x4, uint8_t>;
FIXTURE_DATA_TEST_CASE(RunSmall, CpuGemmInterleave4x4Fixture, framework::DatasetMode::PRECOMMIT, data_interleave * make("DataType", DataType::QASYMM8))
{
    // Validate output
    validate(Accessor(_target), _reference);
}
TEST_SUITE_END() // U8

TEST_SUITE_END() // INTERLEAVE_4X4

template <typename T>
using NEGEMMFixture = GEMMValidationFixture<Tensor, Accessor, NEGEMM, T>;

// Trailing fixture flags select the batched mat-mul configuration of GEMMValidationFixture;
// they are restored here on a best-effort basis.
template <typename T>
using NEBatchedMatMulFixture = GEMMValidationFixture<Tensor, Accessor, NEGEMM, T, true, false, false, false, false, true>;

template <typename T>
using NEGEMMAccumulateFixture = GEMMAccumulateValidationFixture<Tensor, Accessor, NEGEMM, T>;

TEST_SUITE(Float)
DATA_TEST_CASE(ValidateZeroPadding, framework::DatasetMode::ALL, zip(
               make("In0", { TensorShape(21U, 13U),
                             TensorShape(31U, 1U),
                             TensorShape(31U, 1U),
                             TensorShape(8U, 2U),
                             TensorShape(38U, 12U),
                             TensorShape(32U, 1U)
                           }),
               make("In1", { TensorShape(33U, 21U),
                             TensorShape(23U, 31U),
                             TensorShape(23U, 31U),
                             TensorShape(16U, 8U),
                             TensorShape(21U, 38U),
                             TensorShape(17U, 32U)
                           })),
               shape0, shape1)
{
    bool status = validate_gemm_zero_padding(shape0, shape1);
    ARM_COMPUTE_EXPECT(status, framework::LogLevel::ERRORS);
}

DATA_TEST_CASE(ValidateAccumulate, framework::DatasetMode::ALL, combine(
               zip(make("In0", { TensorShape(21U, 13U) }),
                   make("In1", { TensorShape(33U, 21U) }),
                   make("Dst", { TensorShape(33U, 13U) })),
21U) }), make("Dst", { TensorShape(33U, 13U) })), zip( make("alpha", { 1.0, 100.0, 1.0, 1.0 }), make("beta", { 0.0, 0.0, 1.0, 1.0 }), make("is_c_null", { false, false, false, true }), make("Expected", { true, false, false, true }))), shape_a, shape_b, shape_dst, alpha, beta, is_c_null, expected) { /* Accumulation test for GEMM kernels */ // Create tensors TensorInfo in_a(shape_a, 1, DataType::F32); TensorInfo in_b(shape_b, 1, DataType::F32); TensorInfo in_c(shape_dst, 1, DataType::F32); TensorInfo dst(shape_dst, 1, DataType::F32); GEMMInfo gemm_info = GEMMInfo(); gemm_info.set_accumulate(true); // Validate accumulation cpu::CpuGemm gemm; Status status = gemm.validate(&in_a, &in_b, (is_c_null ? nullptr : &in_c), &dst, alpha, beta, gemm_info); ARM_COMPUTE_EXPECT((expected == bool(status)), framework::LogLevel::ERRORS); } #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC TEST_SUITE(FP16) FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMFixture, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallGEMMDataset(), make("ReshapeWeights", { true, false })), make("DataType", DataType::F16))) { // Validate output validate(Accessor(_target), _reference, rel_tolerance_f16, tolerance_num, abs_tolerance_f16); } FIXTURE_DATA_TEST_CASE(RunLarge, NEGEMMFixture, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeGEMMDataset(), make("ReshapeWeights", { true, false })), make("DataType", DataType::F16))) { // Validate output validate(Accessor(_target), _reference, rel_tolerance_f16, tolerance_num, abs_tolerance_f16); } TEST_SUITE(BATCHED_MATMUL) FIXTURE_DATA_TEST_CASE(RunSmall, NEBatchedMatMulFixture, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallBatchedMatMulDataset(), make("ReshapeWeights", { false })), make("DataType", DataType::F16))) { // Validate output validate(Accessor(_target), _reference, rel_tolerance_f16, tolerance_num, abs_tolerance_f16); } TEST_SUITE_END() // BATCHED_MATMUL TEST_SUITE_END() // FP16 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ TEST_SUITE(FP32) FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMFixture, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallGEMMDataset(), make("ReshapeWeights", { true, false })), make("DataType", DataType::F32))) { // Validate output validate(Accessor(_target), _reference, tolerance_f); } FIXTURE_DATA_TEST_CASE(RunLarge, NEGEMMFixture, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeGEMMDataset(), make("ReshapeWeights", { true, false })), make("DataType", DataType::F32))) { // Validate output validate(Accessor(_target), _reference, tolerance_f); } TEST_SUITE(BATCHED_MATMUL) FIXTURE_DATA_TEST_CASE(RunSmall, NEBatchedMatMulFixture, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallBatchedMatMulDataset(), make("ReshapeWeights", { false })), make("DataType", DataType::F32))) { // Validate output validate(Accessor(_target), _reference, tolerance_f); } TEST_SUITE_END() // BATCHED_MATMUL TEST_SUITE(ACCUMULATE) FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMAccumulateFixture, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallAccumulateGEMMDataset(), make("ReshapeWeights", { false }), make("DataType", DataType::F32))) { // Validate output validate(Accessor(_target), _reference, tolerance_f); } FIXTURE_DATA_TEST_CASE(RunLarge, NEGEMMAccumulateFixture, framework::DatasetMode::NIGHTLY, combine(datasets::LargeAccumulateGEMMDataset(), make("ReshapeWeights", { false }), make("DataType", DataType::F32))) { // Validate output validate(Accessor(_target), _reference, tolerance_f); } TEST_SUITE_END() // 
TEST_SUITE_END() // FP32
TEST_SUITE_END() // Float
TEST_SUITE_END() // GEMM
TEST_SUITE_END() // NEON
} // namespace validation
} // namespace test
} // namespace arm_compute