From fde45d836cf753a94915ac42d8a13da7edc52221 Mon Sep 17 00:00:00 2001 From: Adnan AlSinan Date: Tue, 24 Oct 2023 12:03:21 +0100 Subject: Extend CKW MatMul with nt_t - Add the kernel variant: (nt_t) to GpuCKWMatMul. - Extend CKW MatMul validation test with nt_t. - Fixes a bug in CKW where z-dim = 1. Resolves: COMPMID-6435 Signed-off-by: Adnan AlSinan Change-Id: I4c5e8791e55f21ffff3c11eca7802c51a4259977 Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/10525 Comments-Addressed: Arm Jenkins Tested-by: Arm Jenkins Reviewed-by: Gian Marco Iodice Benchmark: Arm Jenkins --- tests/validation/dynamic_fusion/gpu/cl/MatMul.cpp | 350 +++++++++++++++++++++ .../dynamic_fusion/gpu/cl/MatMulKernelFixture.h | 267 ++++++++++++++++ 2 files changed, 617 insertions(+) create mode 100644 tests/validation/dynamic_fusion/gpu/cl/MatMul.cpp create mode 100644 tests/validation/fixtures/dynamic_fusion/gpu/cl/MatMulKernelFixture.h (limited to 'tests') diff --git a/tests/validation/dynamic_fusion/gpu/cl/MatMul.cpp b/tests/validation/dynamic_fusion/gpu/cl/MatMul.cpp new file mode 100644 index 0000000000..38c3a0ca0e --- /dev/null +++ b/tests/validation/dynamic_fusion/gpu/cl/MatMul.cpp @@ -0,0 +1,350 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifdef ACL_INTERNAL_TEST_CKW_IN_DF +#include "tests/AssetsLibrary.h" +#include "tests/CL/CLAccessor.h" +#include "tests/framework/Fixture.h" +#include "tests/framework/Macros.h" +#include "tests/framework/datasets/Datasets.h" +#include "tests/datasets/LargeMatMulDataset.h" +#include "tests/datasets/SmallMatMulDataset.h" +#include "tests/validation/Validation.h" +#include "tests/validation/reference/Permute.h" +#include "tests/validation/reference/GEMM.h" + +#include "tests/validation/fixtures/dynamic_fusion/gpu/cl/MatMulKernelFixture.h" + +#include + +namespace arm_compute +{ +namespace test +{ +namespace validation +{ +namespace +{ + RelativeTolerance tolerance_f32(0.001f); /**< Tolerance value for comparing reference's output against implementation's output for floating point data types */ +constexpr float abs_tolerance_f32( + 0.0001f); /**< Absolute tolerance value for comparing reference's output against implementation's output for floating point data types in case using relative tolerance fails because of small values */ +constexpr float abs_tolerance_f16( + 0.001f); /**< Absolute tolerance value for comparing reference's output against implementation's output for fp16 data types in case using relative tolerance fails because of small values */ + RelativeTolerance tolerance_f16(half(0.02)); /**< Tolerance value for comparing reference's output against implementation's output for floating point data types */ +} + +/** M0 values to test --precommit*/ +const auto m0_values_precommit = framework::dataset::make("M0", { 1, 3 }); + +/** N0 values to test --precommit*/ +const auto n0_values_precommit = framework::dataset::make("N0", { 1, 2, 4 }); + +/** K0 values to test --precommit*/ +const auto k0_values_precommit = framework::dataset::make("K0", { 1, 2, 3 }); + +/** M0 values to test --nightly*/ +const auto m0_values_nightly_lhs_nt = framework::dataset::make("M0", { 1, 2, 3, 4, 5, 6, 7, 8 }); +const auto m0_values_nightly_lhs_t = framework::dataset::make("M0", { 1, 2, 3, 4, 8 }); + +/** N0 values to test --nightly*/ +const auto n0_values_nightly_rhs_nt = framework::dataset::make("N0", { 1, 2, 3, 4, 8, 16 }); +const auto n0_values_nightly_rhs_t = framework::dataset::make("N0", { 1, 2, 3, 4, 8 }); + +/** K0 values to test --nightly*/ +const auto k0_values_nightly_lhs_nt_rhs_nt = framework::dataset::make("K0", { 1, 2, 3, 4, 8, 16 }); +const auto k0_values_nightly_rhs_t = framework::dataset::make("K0", { 1, 2, 3, 4, 8 }); +const auto k0_values_nightly_lhs_t_rhs_nt = framework::dataset::make("K0", { 1, 2, 3, 4, 5, 6, 7, 8 }); + +TEST_SUITE(CL) +TEST_SUITE(DYNAMIC_FUSION) + +TEST_SUITE(MatMul) + +TEST_SUITE(Validate) +TEST_CASE(SupportedBlockSizes, framework::DatasetMode::ALL) +{ + using MatMulConfigurationPair = std::pair; + + const std::vector supported_block_sizes = + { + // MatMulKernelInfo(adj_lhs, adj_rhs, M0, N0, K0, export_rhs_to_cl_image = false) + + // Lhs not-transposed, Rhs transposed + { MatMulKernelInfo(false, true, 0, 1, 1), false }, // M0 should be > 0 + { MatMulKernelInfo(false, true, 3, 11, 1), false }, // N0 not in {1, 2, 3, 4, 8, 16} + { MatMulKernelInfo(false, true, 3, 7, 1), false }, // N0 not in {1, 2, 3, 4, 8, 16} + { MatMulKernelInfo(false, true, 3, 3, 12), false }, // K0 not in {1, 2, 3, 4, 8, 16} + { MatMulKernelInfo(false, true, 3, 3, 6), false }, // K0 not in {1, 2, 3, 4, 8, 16} + { MatMulKernelInfo(false, true, 5, 1, 2), true }, + { MatMulKernelInfo(false, true, 3, 3, 3), true }, + { MatMulKernelInfo(false, true, 2, 4, 8), true }, + + }; + + // Create a new workload sketch + auto cl_compile_ctx = CLKernelLibrary::get().get_compile_context(); + auto context = GpuWorkloadContext{ &cl_compile_ctx }; + GpuWorkloadSketch sketch{ &context }; + + // Set big enough shapes so that block sizes are not truncated. Also, set all dimensions equal + // so that it doesn't fail for different NT/T configurations. We aim to test the block sizes here, + // not the shapes themselves. + const TensorInfo lhs_info = context.create_tensor_info(TensorInfo(TensorShape(100U, 100U), 1, DataType::F32)); + const TensorInfo rhs_info = context.create_tensor_info(TensorInfo(TensorShape(100U, 100U), 1, DataType::F32)); + + for(auto &pair : supported_block_sizes) + { + MatMulAttributes matmul_attr {}; + matmul_attr.adj_lhs(pair.first.adj_lhs); + matmul_attr.adj_rhs(pair.first.adj_rhs); + + GpuMatMulSettings matmul_settings {}; + matmul_settings.m0(pair.first.m0); + matmul_settings.n0(pair.first.n0); + matmul_settings.k0(pair.first.k0); + + Status status = GpuMatMul::validate_op(sketch, &lhs_info, &rhs_info, matmul_attr, matmul_settings); + ARM_COMPUTE_EXPECT(bool(status) == pair.second, framework::LogLevel::ERRORS); + } +} + +TEST_CASE(ValidateInputShapes, framework::DatasetMode::ALL) +{ + // Create a sketch + auto cl_compile_ctx = CLKernelLibrary::get().get_compile_context(); + auto context = GpuWorkloadContext{ &cl_compile_ctx }; + GpuWorkloadSketch sketch{ &context }; + + // Configurations are assumed to be Nt/Nt, but will be transposed inside the test to test other configurations + using ShapeConfigurationTuple = std::tuple; + const std::vector shape_configurations = + { + { TensorShape(5U, 1U), TensorShape(3U, 5U), true }, + { TensorShape(10U, 12U), TensorShape(3U, 10U), true }, + { TensorShape(8U, 4U), TensorShape(2U, 8U), true }, + { TensorShape(8U, 4U), TensorShape(2U, 5U), false }, // Mismatch in the K dimension + { TensorShape(5U, 0U), TensorShape(2U, 5U), false }, // Invalid dimension + { TensorShape(5U, 4U, 3U, 4U, 5U, 6U), TensorShape(2U, 5U, 3U, 4U, 5U, 6U), true }, + { TensorShape(5U, 4U, 3U, 4U, 5U, 1U), TensorShape(2U, 5U, 3U, 4U, 5U, 6U), false }, // no batch broadcasting + { TensorShape(5U, 4U, 3U, 4U, 9U, 6U), TensorShape(2U, 5U, 3U, 4U, 5U, 6U), false }, // mismatch in batch dimension + }; + + for(auto &tuple : shape_configurations) + { + const bool expected = std::get<2>(tuple); + + for(bool adj_lhs : + { + false + }) + { + for(bool adj_rhs : + { + true + }) + { + TensorShape lhs_shape = std::get<0>(tuple); + TensorShape rhs_shape = std::get<1>(tuple); + + if(adj_lhs) + { + permute(lhs_shape, PermutationVector(1U, 0U)); + } + + if(adj_rhs) + { + permute(rhs_shape, PermutationVector(1U, 0U)); + } + + const TensorInfo lhs_info = context.create_tensor_info(TensorInfo(lhs_shape, 1, DataType::F32)); + const TensorInfo rhs_info = context.create_tensor_info(TensorInfo(rhs_shape, 1, DataType::F32)); + + MatMulAttributes matmul_attr {}; + matmul_attr.adj_lhs(adj_lhs); + matmul_attr.adj_rhs(adj_rhs); + + GpuMatMulSettings matmul_settings {}; + matmul_settings.m0(1); + matmul_settings.n0(1); + matmul_settings.k0(1); + + Status status = GpuMatMul::validate_op(sketch, &lhs_info, &rhs_info, matmul_attr, matmul_settings); + ARM_COMPUTE_EXPECT(bool(status) == expected, framework::LogLevel::ERRORS); + } + } + } +} + + +TEST_CASE(ValidateDataTypes, framework::DatasetMode::ALL) +{ + // Configurations are assumed to be Nt/Nt, but will be transposed inside the test to test other configurations + using DataTypeConfigurationTuple = std::tuple; + const std::vector data_type_configurations = + { + { DataType::F32, DataType::F32, DataType::F32, true }, + { DataType::F16, DataType::F16, DataType::F16, true }, + { DataType::F16, DataType::F32, DataType::F32, false }, // no mixed precision + { DataType::F64, DataType::F64, DataType::F64, false }, // no double precision + { DataType::QASYMM8, DataType::QASYMM8, DataType::QASYMM8, false }, // no quantized types + { DataType::QASYMM8_SIGNED, DataType::QASYMM8_SIGNED, DataType::QASYMM8_SIGNED, false }, // no quantized types + { DataType::QSYMM8_PER_CHANNEL, DataType::QSYMM8_PER_CHANNEL, DataType::QSYMM8_PER_CHANNEL, false }, // no quantized types + { DataType::QASYMM16, DataType::QASYMM16, DataType::QASYMM16, false }, // no quantized types + { DataType::QSYMM16, DataType::QSYMM16, DataType::QSYMM16, false }, // no quantized types + { DataType::QSYMM8, DataType::QSYMM8, DataType::QSYMM8, false }, // no quantized types + { DataType::S64, DataType::S64, DataType::S64, false }, // no integral types + { DataType::S32, DataType::S32, DataType::S32, false }, // no integral types + { DataType::S16, DataType::S16, DataType::S16, false }, // no integral types + { DataType::S8, DataType::S8, DataType::S8, false }, // no integral types + { DataType::U64, DataType::U64, DataType::U64, false }, // no integral types + { DataType::U32, DataType::U32, DataType::U32, false }, // no integral types + { DataType::U16, DataType::U16, DataType::U16, false }, // no integral types + { DataType::U8, DataType::U8, DataType::U8, false }, // no integral types + }; + // Create a sketch + auto cl_compile_ctx = CLKernelLibrary::get().get_compile_context(); + auto context = GpuWorkloadContext{ &cl_compile_ctx }; + GpuWorkloadSketch sketch{ &context }; + + const TensorShape shape = TensorShape(10U, 10U); + MatMulAttributes matmul_attr {}; + matmul_attr.adj_lhs(false); + matmul_attr.adj_rhs(false); + GpuMatMulSettings matmul_settings {}; + matmul_settings.m0(1); + matmul_settings.n0(1); + matmul_settings.k0(1); + + for(auto &tuple : data_type_configurations) + { + const bool expected = std::get<3>(tuple); + + const TensorInfo lhs_info = context.create_tensor_info(TensorInfo(shape, 1, std::get<0>(tuple))); + const TensorInfo rhs_info = context.create_tensor_info(TensorInfo(shape, 1, std::get<1>(tuple))); + + Status status = GpuMatMul::validate_op(sketch, &lhs_info, &rhs_info, matmul_attr, matmul_settings); + ARM_COMPUTE_EXPECT(bool(status) == expected, framework::LogLevel::ERRORS); + } +} + +TEST_SUITE_END() // Validate + +template +using DynamicFusionGpuMatmulFixture = DynamicFusionGpuMatMulValidationFixture; + +TEST_SUITE(Float) +TEST_SUITE(FP32) + +FIXTURE_DATA_TEST_CASE(RunTiny, DynamicFusionGpuMatmulFixture, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(combine(combine(datasets::TinyMatMulDataset(), + framework::dataset::make("TransposeA", { false })), + framework::dataset::make("TransposeB", { true })), + m0_values_precommit), + n0_values_precommit), + k0_values_precommit), + framework::dataset::make("ExportRhsToCLImage", { false })), + framework::dataset::make("DataType", DataType::F32))) +{ + // Validate output + validate(CLAccessor(_target), _reference, tolerance_f32, 0.f, abs_tolerance_f32); +} + +FIXTURE_DATA_TEST_CASE(RunSmall, DynamicFusionGpuMatmulFixture, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(combine(combine(datasets::SmallMatMulDataset(), + framework::dataset::make("TransposeA", { false })), + framework::dataset::make("TransposeB", { true })), + m0_values_precommit), + n0_values_precommit), + k0_values_precommit), + framework::dataset::make("ExportRhsToCLImage", { false })), + framework::dataset::make("DataType", DataType::F32))) +{ + // Validate output + validate(CLAccessor(_target), _reference, tolerance_f32, 0.f, abs_tolerance_f32); +} + +FIXTURE_DATA_TEST_CASE(RunLargeRhsTransposed, DynamicFusionGpuMatmulFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(combine(combine(datasets::LargeMatMulDataset(), + framework::dataset::make("TransposeA", { false })), + framework::dataset::make("TransposeB", { true })), + m0_values_nightly_lhs_nt), + n0_values_nightly_rhs_t), + k0_values_nightly_rhs_t), + framework::dataset::make("ExportRhsToCLImage", { false })), + framework::dataset::make("DataType", DataType::F32))) +{ + // Validate output + validate(CLAccessor(_target), _reference, tolerance_f32, 0.f, abs_tolerance_f32); +} + +// Running High Dimensional test is enough for FP32, because we're stressing the number of dimensions, not data type or M0/N0/K0 +FIXTURE_DATA_TEST_CASE(RunHighDimensional, DynamicFusionGpuMatmulFixture, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(combine(combine(datasets::HighDimensionalMatMulDataset(), + framework::dataset::make("TransposeA", { false })), + framework::dataset::make("TransposeB", { true })), + framework::dataset::make("M0", { 2 })), + framework::dataset::make("N0", { 2 })), + framework::dataset::make("K0", { 2 })), + framework::dataset::make("ExportRhsToCLImage", { false })), + framework::dataset::make("DataType", DataType::F32))) +{ + // Validate output + validate(CLAccessor(_target), _reference, tolerance_f32, 0.f, abs_tolerance_f32); +} +TEST_SUITE_END() // FP32 + +TEST_SUITE(FP16) + +FIXTURE_DATA_TEST_CASE(RunSmall, DynamicFusionGpuMatmulFixture, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(combine(combine(datasets::SmallMatMulDataset(), + framework::dataset::make("TransposeA", { false })), + framework::dataset::make("TransposeB", { true })), + m0_values_precommit), + n0_values_precommit), + k0_values_precommit), + framework::dataset::make("ExportRhsToCLImage", { false })), + framework::dataset::make("DataType", DataType::F16))) +{ + // Validate output + validate(CLAccessor(_target), _reference, tolerance_f16, 0.f, abs_tolerance_f16); +} + + +FIXTURE_DATA_TEST_CASE(RunLargeRhsTransposed, DynamicFusionGpuMatmulFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(combine(combine(datasets::LargeMatMulDataset(), + framework::dataset::make("TransposeA", { false })), + framework::dataset::make("TransposeB", { true })), + m0_values_nightly_lhs_nt), + n0_values_nightly_rhs_t), + k0_values_nightly_rhs_t), + framework::dataset::make("ExportRhsToCLImage", { false })), + framework::dataset::make("DataType", DataType::F16))) +{ + // Validate output + validate(CLAccessor(_target), _reference, tolerance_f16, 0.f, abs_tolerance_f16); +} + +TEST_SUITE_END() // FP16 + +TEST_SUITE_END() // Float +TEST_SUITE_END() // MatMul +TEST_SUITE_END() // DYNAMIC_FUSION +TEST_SUITE_END() // CL +} // namespace validation +} // namespace test +} // namespace arm_compute +#endif // ACL_INTERNAL_TEST_CKW_IN_DF diff --git a/tests/validation/fixtures/dynamic_fusion/gpu/cl/MatMulKernelFixture.h b/tests/validation/fixtures/dynamic_fusion/gpu/cl/MatMulKernelFixture.h new file mode 100644 index 0000000000..c6ac4b91db --- /dev/null +++ b/tests/validation/fixtures/dynamic_fusion/gpu/cl/MatMulKernelFixture.h @@ -0,0 +1,267 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ACL_TESTS_VALIDATION_FIXTURES_DYNAMIC_FUSION_GPU_CL_MATMULKERNELFIXTURE_H +#define ACL_TESTS_VALIDATION_FIXTURES_DYNAMIC_FUSION_GPU_CL_MATMULKERNELFIXTURE_H + +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" + +#include "arm_compute/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.h" +#include "arm_compute/dynamic_fusion/sketch/attributes/MatMulAttributes.h" +#include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h" +#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuMatMul.h" +#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuOutput.h" + +#include "tests/CL/CLAccessor.h" +#include "tests/framework/Fixture.h" +#include "tests/framework/Macros.h" +#include "tests/validation/Helpers.h" +#include "tests/validation/Validation.h" +#include "tests/validation/reference/GEMM.h" +#include "tests/validation/reference/Permute.h" +#include "tests/validation/reference/ReshapeLayer.h" + +using namespace arm_compute::experimental::dynamic_fusion; + +namespace arm_compute +{ +namespace test +{ +namespace validation +{ +namespace +{ +template +void fill(U &&tensor, int i) +{ + switch(tensor.data_type()) + { + case DataType::F16: + { + arm_compute::utils::uniform_real_distribution_16bit distribution{ -1.0f, 1.0f }; + library->fill(tensor, distribution, i); + break; + } + case DataType::F32: + { + std::uniform_real_distribution distribution(-1.0f, 1.0f); + library->fill(tensor, distribution, i); + break; + } + default: + library->fill_tensor_uniform(tensor, i); + } +} + +} // namespace +template +class DynamicFusionGpuMatMulValidationGenericFixture : public framework::Fixture +{ + +public: + void setup(TensorShape lhs_shape, TensorShape rhs_shape, TensorShape output_shape, bool transpose_a, bool transpose_b, + int M0, int N0, int K0, bool export_rhs_to_cl_image, DataType data_type) + { + //For brevity, the input shapes are assumed to be not-transposed for both a and b matrices. + if(transpose_a) + { + permute(lhs_shape, PermutationVector(1U, 0U)); + } + if(transpose_b) + { + permute(rhs_shape, PermutationVector(1U, 0U)); + } + + // Skip configurations unsupported by the device. + _device_supports_export_to_cl_image = image2d_from_buffer_supported(CLKernelLibrary::get().get_device()); + if(!_device_supports_export_to_cl_image && export_rhs_to_cl_image) + { + ARM_COMPUTE_TEST_INFO("cl_khr_image2d_from_buffer not supported. TEST skipped"); + framework::ARM_COMPUTE_PRINT_INFO(); + return; // Note: Also need to skip the validate in corresponding FIXTURE_DATA_TEST_CASEs. + } + + _target = compute_target(lhs_shape, rhs_shape, transpose_a, transpose_b, M0, N0, K0, export_rhs_to_cl_image, data_type); + _reference = compute_reference(lhs_shape, rhs_shape, output_shape, transpose_a, transpose_b, data_type); + } + +protected: + TensorType compute_target(TensorShape &shape_a, TensorShape &shape_b, bool transpose_a, bool transpose_b, int M0, int N0, int K0, bool export_rhs_to_cl_image, DataType data_type) + { + ARM_COMPUTE_UNUSED(export_rhs_to_cl_image); + CLScheduler::get().default_reinit(); + + // Create a new workload sketch + auto cl_compile_ctx = CLKernelLibrary::get().get_compile_context(); + auto context = GpuWorkloadContext{ &cl_compile_ctx }; + GpuWorkloadSketch sketch{ &context }; + + // Create sketch tensors + TensorInfo lhs_info = context.create_tensor_info(TensorInfo(shape_a, 1, data_type)); + TensorInfo rhs_info = context.create_tensor_info(TensorInfo(shape_b, 1, data_type)); + TensorInfo dst_info = context.create_tensor_info(); + + MatMulAttributes matmul_attr {}; + matmul_attr.adj_lhs(transpose_a); + matmul_attr.adj_rhs(transpose_b); + + GpuMatMulSettings matmul_settings {}; + matmul_settings.m0(M0); + matmul_settings.n0(N0); + matmul_settings.k0(K0); + + ITensorInfo *ans_info = FunctionType::create_op(sketch, &lhs_info, &rhs_info, matmul_attr, matmul_settings); + GpuOutput::create_op(sketch, ans_info, &dst_info); + + // Configure runtime + ClWorkloadRuntime runtime; + runtime.configure(sketch); + + for(auto &data : runtime.get_auxiliary_tensors()) + { + CLTensor *tensor = std::get<0>(data); + TensorInfo info = std::get<1>(data); + AuxMemoryInfo aux_mem_req = std::get<2>(data); + tensor->allocator()->init(info, aux_mem_req.alignment); + tensor->allocator()->allocate(); // Use ACL allocated memory + } + + // Construct user tensors + TensorType t_lhs{}; + TensorType t_rhs{}; + TensorType t_dst{}; + + // Initialize user tensors + t_lhs.allocator()->init(lhs_info); + t_rhs.allocator()->init(rhs_info); + t_dst.allocator()->init(dst_info); + + ARM_COMPUTE_ASSERT(t_lhs.info()->is_resizable()); + ARM_COMPUTE_ASSERT(t_rhs.info()->is_resizable()); + ARM_COMPUTE_ASSERT(t_dst.info()->is_resizable()); + + // Allocate and fill user tensors + t_lhs.allocator()->allocate(); + t_rhs.allocator()->allocate(); + t_dst.allocator()->allocate(); + + ARM_COMPUTE_ASSERT(!t_lhs.info()->is_resizable()); + ARM_COMPUTE_ASSERT(!t_rhs.info()->is_resizable()); + ARM_COMPUTE_ASSERT(!t_dst.info()->is_resizable()); + + fill(AccessorType(t_lhs), 0); + fill(AccessorType(t_rhs), 1); + + // Run runtime + runtime.run({ &t_lhs, &t_rhs, &t_dst }); + + return t_dst; + } + + SimpleTensor compute_reference(const TensorShape &shape_a, const TensorShape &shape_b, const TensorShape &output_shape, bool pretranspose_a, bool pretranspose_b, DataType data_type) + { + // We collapse dimensions > 3 onto dimension 3, i.e. 5D+ tensors will look like 4D + // This is necessary unless we choose to extend gemm reference for 5D+ tensors + TensorShape output_shape_collapsed = output_shape.collapsed_from(Window::DimZ); + TensorShape shape_a_collapsed = shape_a.collapsed_from(Window::DimZ); + TensorShape shape_b_collapsed = shape_b.collapsed_from(Window::DimZ); + + // Create reference + SimpleTensor a{ shape_a_collapsed, data_type, 1 }; + SimpleTensor b{ shape_b_collapsed, data_type, 1 }; + SimpleTensor c{ output_shape_collapsed, data_type, 1 }; + + // Fill reference + fill(a, 0); + fill(b, 1); + + /* Note: Assuming the usual batch matmul dimensions A = (B x M x K), B = (B x K x N), if pretranspose_A is set to true, then A is assumed to be (B x K x M), + therefore, A must be pre-transposed before passing it to the fixture. And, we transpose A again in the fixture to make it (B x M x K) + in order to be able to call reference implementation that works with (B x M x K) input. + Similarly, if pretranspose_B is set to true, then B is assumed to be (B x N x K), B must be pre-transposed before passing it to the fixture. */ + + // Define transposed shapes + TensorShape a_transposed_shape(a.shape()); + a_transposed_shape.set(0, a.shape().y()); + a_transposed_shape.set(1, a.shape().x()); + + TensorShape b_transposed_shape(b.shape()); + b_transposed_shape.set(0, b.shape().y()); + b_transposed_shape.set(1, b.shape().x()); + + // Define transposed tensors + SimpleTensor a_transposed{ a_transposed_shape, data_type }; + SimpleTensor b_transposed{ b_transposed_shape, data_type }; + + //pretranspose a if necessary + if(pretranspose_a) + { + a_transposed = reference::permute(a, PermutationVector(1U, 0U)); + } + + // pretranspose b if necessary + if(pretranspose_b) + { + b_transposed = reference::permute(b, PermutationVector(1U, 0U)); + } + + // Use transposed tensors if boolean enabled else use original tensors + SimpleTensor result = reference::gemm((pretranspose_a) ? a_transposed : a, (pretranspose_b) ? b_transposed : b, c, 1.0f, 0.f); + + + // We reshape the gemm output back if the tensor is high dimensional + if(output_shape_collapsed != output_shape) + { + // std::cout << "called reshape: \n"; + result = reference::reshape_layer(result, output_shape); + } + + return result; + } + + CLTensor _target{}; + SimpleTensor _reference{}; + bool _device_supports_export_to_cl_image{ false }; + bool _device_supports_mmul{ false }; +}; + +template +class DynamicFusionGpuMatMulValidationFixture : public DynamicFusionGpuMatMulValidationGenericFixture +{ + public: + void setup(TensorShape lhs_shape, TensorShape rhs_shape, TensorShape output_shape, bool transpose_a, bool transpose_b, + int M0, int N0, int K0, bool export_rhs_to_cl_image, DataType data_type) + { + ARM_COMPUTE_UNUSED(export_rhs_to_cl_image); + DynamicFusionGpuMatMulValidationGenericFixture::setup(lhs_shape, rhs_shape, output_shape, transpose_a, transpose_b, M0, + N0, K0, false /* export_rhs_to_cl_image bias */, data_type); + } +}; + +} // namespace validation +} // namespace test +} // namespace arm_compute +#endif // ACL_TESTS_VALIDATION_FIXTURES_DYNAMIC_FUSION_GPU_CL_MATMULKERNELFIXTURE_H -- cgit v1.2.1