From fde45d836cf753a94915ac42d8a13da7edc52221 Mon Sep 17 00:00:00 2001
From: Adnan AlSinan
Date: Tue, 24 Oct 2023 12:03:21 +0100
Subject: Extend CKW MatMul with nt_t

- Add the kernel variant (nt_t) to GpuCKWMatMul.
- Extend the CKW MatMul validation test with nt_t.
- Fix a bug in CKW tensor address computation when z-dim = 1.

Resolves: COMPMID-6435

Signed-off-by: Adnan AlSinan
Change-Id: I4c5e8791e55f21ffff3c11eca7802c51a4259977
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/10525
Comments-Addressed: Arm Jenkins
Tested-by: Arm Jenkins
Reviewed-by: Gian Marco Iodice
Benchmark: Arm Jenkins
---
 compute_kernel_writer/prototype/src/Prototype.h    |   2 +-
 docs/user_guide/release_version_and_change_log.dox |   1 +
 .../gpu/ckw_driver/components/GpuCkwMatMul.cpp     | 220 ++++++++++++-
 .../sketch/gpu/components/cl/ClComponentMatMul.cpp |  20 +-
 .../sketch/gpu/operators/GpuMatMul.cpp             |   2 -
 tests/validation/dynamic_fusion/gpu/cl/MatMul.cpp  | 350 +++++++++++++++++++++
 .../dynamic_fusion/gpu/cl/MatMulKernelFixture.h    | 267 ++++++++++++++++
 7 files changed, 846 insertions(+), 16 deletions(-)
 create mode 100644 tests/validation/dynamic_fusion/gpu/cl/MatMul.cpp
 create mode 100644 tests/validation/fixtures/dynamic_fusion/gpu/cl/MatMulKernelFixture.h

diff --git a/compute_kernel_writer/prototype/src/Prototype.h b/compute_kernel_writer/prototype/src/Prototype.h
index 433eef9e7b..b392fe2651 100644
--- a/compute_kernel_writer/prototype/src/Prototype.h
+++ b/compute_kernel_writer/prototype/src/Prototype.h
@@ -3050,7 +3050,7 @@ private:
             address += " * ";
             address += stride_y;
         }
-        if (z != "0" && (_mapper.is_one_component_z() != true))
+        if (z != "0")
         {
             const std::string stride_z = _mapper.tensor_component_stride_z();
             address += " + (";
diff --git a/docs/user_guide/release_version_and_change_log.dox b/docs/user_guide/release_version_and_change_log.dox
index 2b8f5d87a1..9e67d7e6b4 100644
--- a/docs/user_guide/release_version_and_change_log.dox
+++ b/docs/user_guide/release_version_and_change_log.dox
@@ -49,6 +49,7 @@ v23.11 Public major release
      - @ref experimental::dynamic_fusion::GpuCkwResize
      - @ref experimental::dynamic_fusion::GpuCkwPool2d
      - @ref experimental::dynamic_fusion::GpuCkwDepthwiseConv2d
+     - @ref experimental::dynamic_fusion::GpuCkwMatMul
  - Add support for OpenCL™ command buffer with mutable dispatch extension.
  - Update OpenCL™ API headers to v2023.04.17.
  - Remove legacy PostOps interface. PostOps was the experimental interface for kernel fusion and is replaced by the new Dynamic Fusion interface.
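Note on the variant being added: "nt_t" means the LHS is not transposed (nt) while the RHS is transposed (t), i.e. the RHS is stored as an N x K matrix and both operands are read along the contiguous K axis. Per work-item, the code emitted by GpuCkwMatMul::write_component_code() in the diff below accumulates one m0 x n0 block of the destination, walking K in k0-sized steps plus a scalar leftover loop. The scalar C++ model below is illustrative only (it is not part of the patch, and all names in it are ours); it ignores the edge handling that get_coord() and the OverlappingMin address mode provide in the real kernel:

    #include <vector>

    // Scalar model of the generated nt_t kernel: one "work-item" computes the
    // m0 x n0 destination block at (x, y). lhs is M x K row-major; rhs is
    // stored transposed as N x K, so dst[m][n] = sum_k lhs[m][k] * rhs[n][k].
    void matmul_nt_t_block(const std::vector<float> &lhs, const std::vector<float> &rhs,
                           std::vector<float> &dst, int N, int K,
                           int x, int y, int m0, int n0, int k0)
    {
        for (int i = 0; i < m0; ++i)
        {
            for (int j = 0; j < n0; ++j)
            {
                float acc = 0.f; // dst_tile is initialised to const_0
                int   k_i = 0;
                // Main loop: mirrors op_for_loop(k_i, LessEqual, k - k0, ..., step k0)
                for (; k_i <= K - k0; k_i += k0)
                {
                    for (int kk = 0; kk < k0; ++kk)
                    {
                        acc += lhs[(y + i) * K + k_i + kk] * rhs[(x + j) * K + k_i + kk];
                    }
                }
                // Leftover loop: scalar steps; the kernel emits it only when k % k0 != 0,
                // and it naturally does nothing here when K divides evenly by k0
                for (; k_i < K; ++k_i)
                {
                    acc += lhs[(y + i) * K + k_i] * rhs[(x + j) * K + k_i];
                }
                dst[(y + i) * N + x + j] = acc;
            }
        }
    }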
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwMatMul.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwMatMul.cpp
index 77e5f7af01..9beba03598 100644
--- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwMatMul.cpp
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwMatMul.cpp
@@ -24,9 +24,18 @@
 
 #include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwMatMul.h"
 
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/WriterHelper.h"
 #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h"
 #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
+#include "support/StringSupport.h"
 
+using namespace ckw;
 namespace arm_compute
 {
 namespace experimental
@@ -50,20 +59,225 @@ void GpuCkwMatMul::write_component_code(const ComponentGroup &comp_group,
                                         GpuCkwVariableTable &vtable,
                                         GpuCkwScopedKernelWriter writer) const
 {
-    ARM_COMPUTE_UNUSED(comp_group, vtable, writer);
+    const auto root_window = comp_group.get_root_component()->ckw_component_driver()->get_window();
+
+    GpuCkwComponentArgument *lhs =
+        vtable.declare_variable(comp_group, writer, _lhs, TensorStorageType::ClBufferUint8Ptr, "lhs");
+    GpuCkwComponentArgument *rhs =
+        vtable.declare_variable(comp_group, writer, _rhs, TensorStorageType::ClBufferUint8Ptr, "rhs");
+    GpuCkwComponentArgument *dst =
+        vtable.declare_variable(comp_group, writer, _dst, TensorStorageType::ClBufferUint8Ptr, "dst");
+
+    // Constants
+    const int height_idx = get_data_layout_dimension_index(_lhs->data_layout(), DataLayoutDimension::HEIGHT);
+    const auto &rhs_h = writer->declare_tile("rhs_h", static_cast<int32_t>(_rhs->dimension(height_idx)));
+    const int m = static_cast<int>(_dst->dimension(1));
+    const int n = static_cast<int>(_dst->dimension(0));
+    const int k =
+        _attributes.adj_lhs() ? static_cast<int>(_lhs->tensor_shape().y()) : static_cast<int>(_lhs->tensor_shape().x());
+    const int m0 = root_window.y().step();
+    const int n0 = root_window.x().step();
+    const int k0 = _settings.k0();
+    const int partial_store_m0 = m % m0;
+    const int partial_store_n0 = n % n0;
+
+    const auto &const_1 = writer->declare_tile("1", 1);
+    auto &const_0 = writer->declare_tile("0", 0);
+    auto &k0_tile = writer->declare_tile("k0", k0);
+    auto &k_tile  = writer->declare_tile("k", k);
+
+    auto &gid_0 = writer->declare_tile("gid_0", ckw::DataType::Int32);
+    auto &gid_1 = writer->declare_tile("gid_1", ckw::DataType::Int32);
+    auto &gid_2 = writer->declare_tile("gid_2", ckw::DataType::Int32);
+
+    writer->op_get_global_id(gid_0, 0);
+    writer->op_get_global_id(gid_1, 1);
+    writer->op_get_global_id(gid_2, 2);
+
+    auto &x = writer->declare_tile("x", ckw::DataType::Int32);
+    auto &y = writer->declare_tile("y", ckw::DataType::Int32);
+    auto &z = writer->declare_tile("z", ckw::DataType::Int32);
+
+    get_coord(writer, x, gid_0, n0, partial_store_n0, "gid_x_", const_0);
+    get_coord(writer, y, gid_1, m0, partial_store_m0, "gid_y_", const_0);
+    get_coord(writer, z, gid_2, 1, 0, "gid_z_", const_0);
+
+    TensorTileSampler lhs_sampler;
+    lhs_sampler.height(m0);
+    lhs_sampler.width(k0);
+    lhs_sampler.format(TensorSamplerFormat::C_W_H);
+    lhs_sampler.address_mode_x(TensorSamplerAddressModeX::None);
+    lhs_sampler.address_mode_y(TensorSamplerAddressModeY::None);
+    lhs_sampler.address_mode_z(TensorSamplerAddressModeZ::None);
+
+    TensorTileSampler rhs_sampler;
+    rhs_sampler.height(k0);
+    rhs_sampler.width(n0);
+    rhs_sampler.format(TensorSamplerFormat::C_WH_1);
+    rhs_sampler.address_mode_x(TensorSamplerAddressModeX::None);
+    rhs_sampler.address_mode_y(TensorSamplerAddressModeY::None);
+    rhs_sampler.address_mode_z(TensorSamplerAddressModeZ::None);
+
+    TensorTileSampler dst_sampler;
+    dst_sampler.width(n0);
+    dst_sampler.height(m0);
+    dst_sampler.format(TensorSamplerFormat::C_W_H);
+    dst_sampler.address_mode_x(TensorSamplerAddressModeX::OverlappingMin);
+    dst_sampler.address_mode_y(TensorSamplerAddressModeY::None);
+    dst_sampler.address_mode_z(TensorSamplerAddressModeZ::None);
+    dst_sampler.x(x);
+    dst_sampler.y(y);
+    dst_sampler.z(z);
+    dst_sampler.b(const_0);
+
+    if (!dst->has_tile())
+    {
+        auto &dst_tile = writer->declare_tile("dst_tile", ckw::TileInfo(to_ckw(_dst->data_type()), m0, n0));
+        dst->init_virtual_tensor(dst_tile, dst_sampler);
+    }
+    auto &dst_tile = dst->tile();
+
+    // Initialize the accumulators
+    writer->op_assign(dst_tile, const_0);
+
+    auto &rhs_z = writer->declare_tile("rhs_z", ckw::DataType::Int32);
+    writer->op_binary_expression(rhs_z, z, BinaryOp::Mul, rhs_h);
+
+    auto &k_i     = writer->declare_tile("k_i", ckw::DataType::Int32);
+    auto &k_limit = writer->declare_tile("k_limit", k - k0);
+
+    auto &x_i = writer->declare_tile("x_i", ckw::DataType::Int32);
+    writer->op_assign(x_i, const_0);
+
+    writer->op_assign(k_i, const_0);
+
+    // *INDENT-OFF*
+    // clang-format off
+    writer->op_for_loop(k_i, BinaryOp::LessEqual, k_limit, k_i, AssignmentOp::Increment, k0_tile,
+    [&]()
+    {
+        // Initialize tiles
+        // lhs_tile
+        auto &a = writer->declare_tile("a", ckw::TileInfo(to_ckw(_lhs->data_type()), m0, k0));
+        // rhs_tile
+        auto &b = writer->declare_tile("b", ckw::TileInfo(to_ckw(_rhs->data_type()), n0, k0));
+        writer->op_assign(a, const_0);
+        writer->op_assign(b, const_0);
+
+        // Loading the tiles
+        // LHS
+        lhs_sampler.x(x_i);
+        lhs_sampler.y(y);
+        lhs_sampler.z(z);
+        lhs_sampler.b(const_0);
+        writer->op_load(a, lhs->tensor(), lhs_sampler);
+
+        // RHS
+        auto &y_i = writer->declare_tile("y_i", ckw::DataType::Int32);
+        writer->op_binary_expression(y_i, x, BinaryOp::Add, rhs_z);
+        rhs_sampler.x(k_i);
+        rhs_sampler.y(y_i);
+        rhs_sampler.z(const_0);
+        rhs_sampler.b(const_0);
+        writer->op_load(b, rhs->tensor(), rhs_sampler);
+
+        // Perform Matmul
+        writer->op_binary_expression(dst_tile, a, BinaryOp::MatMul_Nt_T, b);
+        writer->op_binary_expression(x_i, x_i, BinaryOp::Add, k0_tile);
+    });
+    // *INDENT-ON*
+    // clang-format on
+
+    // Handling leftovers
+    if (k % k0 != 0)
+    {
+        // *INDENT-OFF*
+        // clang-format off
+        writer->op_for_loop(k_i, BinaryOp::Less, k_tile, k_i, AssignmentOp::Increment, const_1,
+        [&]()
+        {
+            // Initialize tiles
+            // lhs_tile
+            auto &a = writer->declare_tile("a_leftover", ckw::TileInfo(to_ckw(_lhs->data_type()), m0, 1));
+            // rhs_tile
+            auto &b = writer->declare_tile("b_leftover", ckw::TileInfo(to_ckw(_rhs->data_type()), n0, 1));
+            writer->op_assign(a, const_0);
+            writer->op_assign(b, const_0);
+
+            // Loading the tiles
+            // LHS
+            lhs_sampler.x(x_i);
+            lhs_sampler.y(y);
+            lhs_sampler.z(z);
+            lhs_sampler.b(const_0);
+            writer->op_load(a, lhs->tensor(), lhs_sampler);
+
+            // RHS
+            auto &y_i = writer->declare_tile("y_i_leftover", ckw::DataType::Int32);
+            writer->op_binary_expression(y_i, x, BinaryOp::Add, rhs_z);
+            rhs_sampler.x(k_i);
+            rhs_sampler.y(y_i);
+            rhs_sampler.z(const_0);
+            rhs_sampler.b(const_0);
+            writer->op_load(b, rhs->tensor(), rhs_sampler);
+
+            // Perform Matmul
+            writer->op_binary_expression(dst_tile, a, BinaryOp::MatMul_Nt_T, b);
+            writer->op_binary_expression(x_i, x_i, BinaryOp::Add, const_1);
+        });
+        // *INDENT-ON*
+        // clang-format on
+    }
 }
 
 Window GpuCkwMatMul::get_window() const
 {
     ARM_COMPUTE_ERROR_ON_MSG(_dst->tensor_shape().total_size() == 0U, "Destination tensor is not initialized");
-    return Window();
+
+    const int  m       = _dst->dimension(1);
+    const int  n       = _dst->dimension(0);
+    const bool adj_lhs = _attributes.adj_lhs();
+
+    int m0 = adj_lhs ? adjust_vec_size(_settings.m0(), m) : std::min(_settings.m0(), m);
+    int n0 = adjust_vec_size(_settings.n0(), n);
+
+    // Configure kernel window
+    Window win = calculate_max_window(_dst->tensor_shape(), Steps(n0, m0));
+    win        = win.collapse(win, Window::DimZ);
+
+    return win;
 }
 
 std::string GpuCkwMatMul::get_name(const ComponentGroup &comp_group) const
 {
     ARM_COMPUTE_UNUSED(comp_group);
-    return "MatMul";
+    std::string kernel_name("mat_mul_native");
+
+    const int m = _dst->dimension(1);
+    const int n = _dst->dimension(0);
+    const int k = _attributes.adj_lhs() ? _lhs->tensor_shape().y() : _lhs->tensor_shape().x();
+
+    kernel_name += _attributes.adj_lhs() ? "_t" : "_nt";
+    kernel_name += _attributes.adj_rhs() ? "_t" : "_nt";
+    kernel_name += "_";
+    kernel_name += support::cpp11::to_string(m);
+    kernel_name += "_";
+    kernel_name += support::cpp11::to_string(n);
+    kernel_name += "_";
+    kernel_name += support::cpp11::to_string(k);
+    kernel_name += "_";
+    kernel_name += support::cpp11::to_string(_dst->dimension(2));
+    kernel_name += "_";
+    kernel_name += support::cpp11::to_string(_settings.m0());
+    kernel_name += "_";
+    kernel_name += support::cpp11::to_string(_settings.n0());
+    kernel_name += "_";
+    kernel_name += support::cpp11::to_string(_settings.k0());
+
+    return kernel_name;
 }
 
 } // namespace dynamic_fusion
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentMatMul.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentMatMul.cpp
index eada61e1b3..f238d42d98 100644
--- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentMatMul.cpp
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentMatMul.cpp
@@ -91,14 +91,16 @@ Status ClComponentMatMul::validate(const Properties &properties,
     const auto rhs = tensors.get_const_tensor(TensorType::ACL_SRC_1);
     const auto dst = tensors.get_const_tensor(TensorType::ACL_DST_0);
 
+    // Currently, the only supported case is when adj_lhs = false and adj_rhs = true
+    ARM_COMPUTE_RETURN_ERROR_ON((attributes.adj_lhs() != false) && (attributes.adj_rhs() != true));
+
     // Check if Matching data type
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, rhs);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, dst);
 
     // Data type
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F16, DataType::F32);
-    // Data layout
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(lhs, DataLayout::NHWC);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, dst);
 
     // All tensor infos are initialized
     ARM_COMPUTE_RETURN_ERROR_ON(lhs->tensor_shape().total_size() == 0);
@@ -108,20 +110,18 @@ Status ClComponentMatMul::validate(const Properties &properties,
     // Device requirements are met
     ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(lhs);
 
-    // Check if dst shape is correct
+    // Check if block sizes are supported
     MatMulKernelInfo matmul_kernel_info =
         MatMulKernelInfo(attributes.adj_lhs(), attributes.adj_rhs(), settings.m0(), settings.n0(), settings.k0());
-    const auto expected_dst_shape =
-        misc::shape_calculator::compute_matmul_shape(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info);
-
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), expected_dst_shape);
-
-    // Check if block sizes are supported
     ARM_COMPUTE_RETURN_ON_ERROR(validate_matmul_kernel_info(attributes, settings));
-
     ARM_COMPUTE_RETURN_ON_ERROR(
         opencl::kernels::validate_matmul_input_shapes(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info));
 
+    // Check if dst shape is correct
+    const auto expected_dst_shape =
+        misc::shape_calculator::compute_matmul_shape(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), expected_dst_shape);
+
     return Status{};
 }
diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuMatMul.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuMatMul.cpp
index ee27b5ea47..e24629a036 100644
--- a/src/dynamic_fusion/sketch/gpu/operators/GpuMatMul.cpp
+++ b/src/dynamic_fusion/sketch/gpu/operators/GpuMatMul.cpp
@@ -87,8 +87,6 @@ Status is_supported_op_helper(const GpuWorkloadContext &context,
     // Check support level
     // Data type
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F16, DataType::F32);
-    // Data layout
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(lhs, DataLayout::NHWC);
 
     // Check components
     if (context.gpu_language() == GpuLanguage::OpenCL)
diff --git a/tests/validation/dynamic_fusion/gpu/cl/MatMul.cpp b/tests/validation/dynamic_fusion/gpu/cl/MatMul.cpp
new file mode 100644
index 0000000000..38c3a0ca0e
--- /dev/null
+++ b/tests/validation/dynamic_fusion/gpu/cl/MatMul.cpp
@@ -0,0 +1,350 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef ACL_INTERNAL_TEST_CKW_IN_DF
+#include "tests/AssetsLibrary.h"
+#include "tests/CL/CLAccessor.h"
+#include "tests/framework/Fixture.h"
+#include "tests/framework/Macros.h"
+#include "tests/framework/datasets/Datasets.h"
+#include "tests/datasets/LargeMatMulDataset.h"
+#include "tests/datasets/SmallMatMulDataset.h"
+#include "tests/validation/Validation.h"
+#include "tests/validation/reference/Permute.h"
+#include "tests/validation/reference/GEMM.h"
+
+#include "tests/validation/fixtures/dynamic_fusion/gpu/cl/MatMulKernelFixture.h"
+
+#include <tuple>
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace
+{
+RelativeTolerance<float> tolerance_f32(0.001f); /**< Tolerance value for comparing reference's output against implementation's output for floating point data types */
+constexpr float abs_tolerance_f32(
+    0.0001f); /**< Absolute tolerance value for comparing reference's output against implementation's output for floating point data types in case using relative tolerance fails because of small values */
+constexpr float abs_tolerance_f16(
+    0.001f); /**< Absolute tolerance value for comparing reference's output against implementation's output for fp16 data types in case using relative tolerance fails because of small values */
+RelativeTolerance<half> tolerance_f16(half(0.02)); /**< Tolerance value for comparing reference's output against implementation's output for floating point data types */
+} // namespace
+
+/** M0 values to test --precommit*/
+const auto m0_values_precommit = framework::dataset::make("M0", { 1, 3 });
+
+/** N0 values to test --precommit*/
+const auto n0_values_precommit = framework::dataset::make("N0", { 1, 2, 4 });
+
+/** K0 values to test --precommit*/
+const auto k0_values_precommit = framework::dataset::make("K0", { 1, 2, 3 });
+
+/** M0 values to test --nightly*/
+const auto m0_values_nightly_lhs_nt = framework::dataset::make("M0", { 1, 2, 3, 4, 5, 6, 7, 8 });
+const auto m0_values_nightly_lhs_t  = framework::dataset::make("M0", { 1, 2, 3, 4, 8 });
+
+/** N0 values to test --nightly*/
+const auto n0_values_nightly_rhs_nt = framework::dataset::make("N0", { 1, 2, 3, 4, 8, 16 });
+const auto n0_values_nightly_rhs_t  = framework::dataset::make("N0", { 1, 2, 3, 4, 8 });
+
+/** K0 values to test --nightly*/
+const auto k0_values_nightly_lhs_nt_rhs_nt = framework::dataset::make("K0", { 1, 2, 3, 4, 8, 16 });
+const auto k0_values_nightly_rhs_t         = framework::dataset::make("K0", { 1, 2, 3, 4, 8 });
+const auto k0_values_nightly_lhs_t_rhs_nt  = framework::dataset::make("K0", { 1, 2, 3, 4, 5, 6, 7, 8 });
+
+TEST_SUITE(CL)
+TEST_SUITE(DYNAMIC_FUSION)
+TEST_SUITE(MatMul)
+
+TEST_SUITE(Validate)
+TEST_CASE(SupportedBlockSizes, framework::DatasetMode::ALL)
+{
+    using MatMulConfigurationPair = std::pair<MatMulKernelInfo, bool>;
+
+    const std::vector<MatMulConfigurationPair> supported_block_sizes =
+    {
+        // MatMulKernelInfo(adj_lhs, adj_rhs, M0, N0, K0, export_rhs_to_cl_image = false)
+
+        // Lhs not-transposed, Rhs transposed
+        { MatMulKernelInfo(false, true, 0, 1, 1), false },  // M0 should be > 0
+        { MatMulKernelInfo(false, true, 3, 11, 1), false }, // N0 not in {1, 2, 3, 4, 8, 16}
+        { MatMulKernelInfo(false, true, 3, 7, 1), false },  // N0 not in {1, 2, 3, 4, 8, 16}
+        { MatMulKernelInfo(false, true, 3, 3, 12), false }, // K0 not in {1, 2, 3, 4, 8, 16}
+        { MatMulKernelInfo(false, true, 3, 3, 6), false },  // K0 not in {1, 2, 3, 4, 8, 16}
+        { MatMulKernelInfo(false, true, 5, 1, 2), true },
+        { MatMulKernelInfo(false, true, 3, 3, 3), true },
+        { MatMulKernelInfo(false, true, 2, 4, 8), true },
+    };
+
+    // Create a new workload sketch
+    auto              cl_compile_ctx = CLKernelLibrary::get().get_compile_context();
+    auto              context        = GpuWorkloadContext{ &cl_compile_ctx };
+    GpuWorkloadSketch sketch{ &context };
+
+    // Set big enough shapes so that block sizes are not truncated. Also, set all dimensions equal
+    // so that it doesn't fail for different NT/T configurations. We aim to test the block sizes here,
+    // not the shapes themselves.
+    const TensorInfo lhs_info = context.create_tensor_info(TensorInfo(TensorShape(100U, 100U), 1, DataType::F32));
+    const TensorInfo rhs_info = context.create_tensor_info(TensorInfo(TensorShape(100U, 100U), 1, DataType::F32));
+
+    for(auto &pair : supported_block_sizes)
+    {
+        MatMulAttributes matmul_attr{};
+        matmul_attr.adj_lhs(pair.first.adj_lhs);
+        matmul_attr.adj_rhs(pair.first.adj_rhs);
+
+        GpuMatMulSettings matmul_settings{};
+        matmul_settings.m0(pair.first.m0);
+        matmul_settings.n0(pair.first.n0);
+        matmul_settings.k0(pair.first.k0);
+
+        Status status = GpuMatMul::validate_op(sketch, &lhs_info, &rhs_info, matmul_attr, matmul_settings);
+        ARM_COMPUTE_EXPECT(bool(status) == pair.second, framework::LogLevel::ERRORS);
+    }
+}
+
+TEST_CASE(ValidateInputShapes, framework::DatasetMode::ALL)
+{
+    // Create a sketch
+    auto              cl_compile_ctx = CLKernelLibrary::get().get_compile_context();
+    auto              context        = GpuWorkloadContext{ &cl_compile_ctx };
+    GpuWorkloadSketch sketch{ &context };
+
+    // Configurations are assumed to be Nt/Nt, but will be transposed inside the test to test other configurations
+    using ShapeConfigurationTuple = std::tuple<TensorShape, TensorShape, bool>;
+    const std::vector<ShapeConfigurationTuple> shape_configurations =
+    {
+        { TensorShape(5U, 1U), TensorShape(3U, 5U), true },
+        { TensorShape(10U, 12U), TensorShape(3U, 10U), true },
+        { TensorShape(8U, 4U), TensorShape(2U, 8U), true },
+        { TensorShape(8U, 4U), TensorShape(2U, 5U), false }, // Mismatch in the K dimension
+        { TensorShape(5U, 0U), TensorShape(2U, 5U), false }, // Invalid dimension
+        { TensorShape(5U, 4U, 3U, 4U, 5U, 6U), TensorShape(2U, 5U, 3U, 4U, 5U, 6U), true },
+        { TensorShape(5U, 4U, 3U, 4U, 5U, 1U), TensorShape(2U, 5U, 3U, 4U, 5U, 6U), false }, // no batch broadcasting
+        { TensorShape(5U, 4U, 3U, 4U, 9U, 6U), TensorShape(2U, 5U, 3U, 4U, 5U, 6U), false }, // mismatch in batch dimension
+    };
+
+    for(auto &tuple : shape_configurations)
+    {
+        const bool expected = std::get<2>(tuple);
+
+        for(bool adj_lhs : { false })
+        {
+            for(bool adj_rhs : { true })
+            {
+                TensorShape lhs_shape = std::get<0>(tuple);
+                TensorShape rhs_shape = std::get<1>(tuple);
+
+                if(adj_lhs)
+                {
+                    permute(lhs_shape, PermutationVector(1U, 0U));
+                }
+
+                if(adj_rhs)
+                {
+                    permute(rhs_shape, PermutationVector(1U, 0U));
+                }
+
+                const TensorInfo lhs_info = context.create_tensor_info(TensorInfo(lhs_shape, 1, DataType::F32));
+                const TensorInfo rhs_info = context.create_tensor_info(TensorInfo(rhs_shape, 1, DataType::F32));
+
+                MatMulAttributes matmul_attr{};
+                matmul_attr.adj_lhs(adj_lhs);
+                matmul_attr.adj_rhs(adj_rhs);
+
+                GpuMatMulSettings matmul_settings{};
+                matmul_settings.m0(1);
+                matmul_settings.n0(1);
+                matmul_settings.k0(1);
+
+                Status status = GpuMatMul::validate_op(sketch, &lhs_info, &rhs_info, matmul_attr, matmul_settings);
+                ARM_COMPUTE_EXPECT(bool(status) == expected, framework::LogLevel::ERRORS);
+            }
+        }
+    }
+}
+
+TEST_CASE(ValidateDataTypes, framework::DatasetMode::ALL)
+{
+    // Configurations are assumed to be Nt/Nt, but will be transposed inside the test to test other configurations
+    using DataTypeConfigurationTuple = std::tuple<DataType, DataType, DataType, bool>;
+    const std::vector<DataTypeConfigurationTuple> data_type_configurations =
+    {
+        { DataType::F32, DataType::F32, DataType::F32, true },
+        { DataType::F16, DataType::F16, DataType::F16, true },
+        { DataType::F16, DataType::F32, DataType::F32, false },             // no mixed precision
+        { DataType::F64, DataType::F64, DataType::F64, false },             // no double precision
+        { DataType::QASYMM8, DataType::QASYMM8, DataType::QASYMM8, false }, // no quantized types
+        { DataType::QASYMM8_SIGNED, DataType::QASYMM8_SIGNED, DataType::QASYMM8_SIGNED, false },             // no quantized types
+        { DataType::QSYMM8_PER_CHANNEL, DataType::QSYMM8_PER_CHANNEL, DataType::QSYMM8_PER_CHANNEL, false }, // no quantized types
+        { DataType::QASYMM16, DataType::QASYMM16, DataType::QASYMM16, false }, // no quantized types
+        { DataType::QSYMM16, DataType::QSYMM16, DataType::QSYMM16, false },    // no quantized types
+        { DataType::QSYMM8, DataType::QSYMM8, DataType::QSYMM8, false },       // no quantized types
+        { DataType::S64, DataType::S64, DataType::S64, false },                // no integral types
+        { DataType::S32, DataType::S32, DataType::S32, false },                // no integral types
+        { DataType::S16, DataType::S16, DataType::S16, false },                // no integral types
+        { DataType::S8, DataType::S8, DataType::S8, false },                   // no integral types
+        { DataType::U64, DataType::U64, DataType::U64, false },                // no integral types
+        { DataType::U32, DataType::U32, DataType::U32, false },                // no integral types
+        { DataType::U16, DataType::U16, DataType::U16, false },                // no integral types
+        { DataType::U8, DataType::U8, DataType::U8, false },                   // no integral types
+    };
+
+    // Create a sketch
+    auto              cl_compile_ctx = CLKernelLibrary::get().get_compile_context();
+    auto              context        = GpuWorkloadContext{ &cl_compile_ctx };
+    GpuWorkloadSketch sketch{ &context };
+
+    const TensorShape shape = TensorShape(10U, 10U);
+    MatMulAttributes  matmul_attr{};
+    matmul_attr.adj_lhs(false);
+    matmul_attr.adj_rhs(false);
+    GpuMatMulSettings matmul_settings{};
+    matmul_settings.m0(1);
+    matmul_settings.n0(1);
+    matmul_settings.k0(1);
+
+    for(auto &tuple : data_type_configurations)
+    {
+        const bool expected = std::get<3>(tuple);
+
+        const TensorInfo lhs_info = context.create_tensor_info(TensorInfo(shape, 1, std::get<0>(tuple)));
+        const TensorInfo rhs_info = context.create_tensor_info(TensorInfo(shape, 1, std::get<1>(tuple)));
+
+        Status status = GpuMatMul::validate_op(sketch, &lhs_info, &rhs_info, matmul_attr, matmul_settings);
+        ARM_COMPUTE_EXPECT(bool(status) == expected, framework::LogLevel::ERRORS);
+    }
+}
+
+TEST_SUITE_END() // Validate
+
+template <typename T>
+using DynamicFusionGpuMatmulFixture = DynamicFusionGpuMatMulValidationFixture<CLTensor, CLAccessor, GpuMatMul, T>;
+
+TEST_SUITE(Float)
+TEST_SUITE(FP32)
+
+FIXTURE_DATA_TEST_CASE(RunTiny, DynamicFusionGpuMatmulFixture<float>, framework::DatasetMode::ALL,
+                       combine(combine(combine(combine(combine(combine(combine(datasets::TinyMatMulDataset(),
+                                                                               framework::dataset::make("TransposeA", { false })),
+                                                                       framework::dataset::make("TransposeB", { true })),
+                                                               m0_values_precommit),
+                                                       n0_values_precommit),
+                                               k0_values_precommit),
+                                       framework::dataset::make("ExportRhsToCLImage", { false })),
+                               framework::dataset::make("DataType", DataType::F32)))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_f32, 0.f, abs_tolerance_f32);
+}
+
+FIXTURE_DATA_TEST_CASE(RunSmall, DynamicFusionGpuMatmulFixture<float>, framework::DatasetMode::ALL,
+                       combine(combine(combine(combine(combine(combine(combine(datasets::SmallMatMulDataset(),
+                                                                               framework::dataset::make("TransposeA", { false })),
+                                                                       framework::dataset::make("TransposeB", { true })),
+                                                               m0_values_precommit),
+                                                       n0_values_precommit),
+                                               k0_values_precommit),
+                                       framework::dataset::make("ExportRhsToCLImage", { false })),
+                               framework::dataset::make("DataType", DataType::F32)))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_f32, 0.f, abs_tolerance_f32);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLargeRhsTransposed, DynamicFusionGpuMatmulFixture<float>, framework::DatasetMode::NIGHTLY,
+                       combine(combine(combine(combine(combine(combine(combine(datasets::LargeMatMulDataset(),
+                                                                               framework::dataset::make("TransposeA", { false })),
+                                                                       framework::dataset::make("TransposeB", { true })),
+                                                               m0_values_nightly_lhs_nt),
+                                                       n0_values_nightly_rhs_t),
+                                               k0_values_nightly_rhs_t),
+                                       framework::dataset::make("ExportRhsToCLImage", { false })),
+                               framework::dataset::make("DataType", DataType::F32)))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_f32, 0.f, abs_tolerance_f32);
+}
+
+// Running the High Dimensional test is enough for FP32, because we're stressing the number of dimensions, not data type or M0/N0/K0
+FIXTURE_DATA_TEST_CASE(RunHighDimensional, DynamicFusionGpuMatmulFixture<float>, framework::DatasetMode::ALL,
+                       combine(combine(combine(combine(combine(combine(combine(datasets::HighDimensionalMatMulDataset(),
+                                                                               framework::dataset::make("TransposeA", { false })),
+                                                                       framework::dataset::make("TransposeB", { true })),
+                                                               framework::dataset::make("M0", { 2 })),
+                                                       framework::dataset::make("N0", { 2 })),
+                                               framework::dataset::make("K0", { 2 })),
+                                       framework::dataset::make("ExportRhsToCLImage", { false })),
+                               framework::dataset::make("DataType", DataType::F32)))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_f32, 0.f, abs_tolerance_f32);
+}
+TEST_SUITE_END() // FP32
+
+TEST_SUITE(FP16)
+
+FIXTURE_DATA_TEST_CASE(RunSmall, DynamicFusionGpuMatmulFixture<half>, framework::DatasetMode::ALL,
+                       combine(combine(combine(combine(combine(combine(combine(datasets::SmallMatMulDataset(),
+                                                                               framework::dataset::make("TransposeA", { false })),
+                                                                       framework::dataset::make("TransposeB", { true })),
+                                                               m0_values_precommit),
+                                                       n0_values_precommit),
+                                               k0_values_precommit),
+                                       framework::dataset::make("ExportRhsToCLImage", { false })),
+                               framework::dataset::make("DataType", DataType::F16)))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_f16, 0.f, abs_tolerance_f16);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLargeRhsTransposed, DynamicFusionGpuMatmulFixture<half>, framework::DatasetMode::NIGHTLY,
+                       combine(combine(combine(combine(combine(combine(combine(datasets::LargeMatMulDataset(),
+                                                                               framework::dataset::make("TransposeA", { false })),
+                                                                       framework::dataset::make("TransposeB", { true })),
+                                                               m0_values_nightly_lhs_nt),
+                                                       n0_values_nightly_rhs_t),
+                                               k0_values_nightly_rhs_t),
+                                       framework::dataset::make("ExportRhsToCLImage", { false })),
+                               framework::dataset::make("DataType", DataType::F16)))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_f16, 0.f, abs_tolerance_f16);
+}
+
+TEST_SUITE_END() // FP16
+
+TEST_SUITE_END() // Float
+TEST_SUITE_END() // MatMul
+TEST_SUITE_END() // DYNAMIC_FUSION
+TEST_SUITE_END() // CL
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
+#endif // ACL_INTERNAL_TEST_CKW_IN_DF
diff --git a/tests/validation/fixtures/dynamic_fusion/gpu/cl/MatMulKernelFixture.h b/tests/validation/fixtures/dynamic_fusion/gpu/cl/MatMulKernelFixture.h
new file mode 100644
index 0000000000..c6ac4b91db
--- /dev/null
+++ b/tests/validation/fixtures/dynamic_fusion/gpu/cl/MatMulKernelFixture.h
@@ -0,0 +1,267 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_TESTS_VALIDATION_FIXTURES_DYNAMIC_FUSION_GPU_CL_MATMULKERNELFIXTURE_H
+#define ACL_TESTS_VALIDATION_FIXTURES_DYNAMIC_FUSION_GPU_CL_MATMULKERNELFIXTURE_H
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+#include "arm_compute/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.h"
+#include "arm_compute/dynamic_fusion/sketch/attributes/MatMulAttributes.h"
+#include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h"
+#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuMatMul.h"
+#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuOutput.h"
+
+#include "tests/CL/CLAccessor.h"
+#include "tests/framework/Fixture.h"
+#include "tests/framework/Macros.h"
+#include "tests/validation/Helpers.h"
+#include "tests/validation/Validation.h"
+#include "tests/validation/reference/GEMM.h"
+#include "tests/validation/reference/Permute.h"
+#include "tests/validation/reference/ReshapeLayer.h"
+
+using namespace arm_compute::experimental::dynamic_fusion;
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace
+{
+template <typename U>
+void fill(U &&tensor, int i)
+{
+    switch(tensor.data_type())
+    {
+        case DataType::F16:
+        {
+            arm_compute::utils::uniform_real_distribution_16bit<half> distribution{ -1.0f, 1.0f };
+            library->fill(tensor, distribution, i);
+            break;
+        }
+        case DataType::F32:
+        {
+            std::uniform_real_distribution<float> distribution(-1.0f, 1.0f);
+            library->fill(tensor, distribution, i);
+            break;
+        }
+        default:
+            library->fill_tensor_uniform(tensor, i);
+    }
+}
+} // namespace
+
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+class DynamicFusionGpuMatMulValidationGenericFixture : public framework::Fixture
+{
+public:
+    void setup(TensorShape lhs_shape, TensorShape rhs_shape, TensorShape output_shape, bool transpose_a, bool transpose_b,
+               int M0, int N0, int K0, bool export_rhs_to_cl_image, DataType data_type)
+    {
+        // For brevity, the input shapes are assumed to be not-transposed for both a and b matrices.
+        if(transpose_a)
+        {
+            permute(lhs_shape, PermutationVector(1U, 0U));
+        }
+        if(transpose_b)
+        {
+            permute(rhs_shape, PermutationVector(1U, 0U));
+        }
+
+        // Skip configurations unsupported by the device.
+        _device_supports_export_to_cl_image = image2d_from_buffer_supported(CLKernelLibrary::get().get_device());
+        if(!_device_supports_export_to_cl_image && export_rhs_to_cl_image)
+        {
+            ARM_COMPUTE_TEST_INFO("cl_khr_image2d_from_buffer not supported. TEST skipped");
+            framework::ARM_COMPUTE_PRINT_INFO();
+            return; // Note: Also need to skip the validate in corresponding FIXTURE_DATA_TEST_CASEs.
+        }
+
+        _target    = compute_target(lhs_shape, rhs_shape, transpose_a, transpose_b, M0, N0, K0, export_rhs_to_cl_image, data_type);
+        _reference = compute_reference(lhs_shape, rhs_shape, output_shape, transpose_a, transpose_b, data_type);
+    }
+
+protected:
+    TensorType compute_target(TensorShape &shape_a, TensorShape &shape_b, bool transpose_a, bool transpose_b, int M0, int N0, int K0, bool export_rhs_to_cl_image, DataType data_type)
+    {
+        ARM_COMPUTE_UNUSED(export_rhs_to_cl_image);
+        CLScheduler::get().default_reinit();
+
+        // Create a new workload sketch
+        auto              cl_compile_ctx = CLKernelLibrary::get().get_compile_context();
+        auto              context        = GpuWorkloadContext{ &cl_compile_ctx };
+        GpuWorkloadSketch sketch{ &context };
+
+        // Create sketch tensors
+        TensorInfo lhs_info = context.create_tensor_info(TensorInfo(shape_a, 1, data_type));
+        TensorInfo rhs_info = context.create_tensor_info(TensorInfo(shape_b, 1, data_type));
+        TensorInfo dst_info = context.create_tensor_info();
+
+        MatMulAttributes matmul_attr{};
+        matmul_attr.adj_lhs(transpose_a);
+        matmul_attr.adj_rhs(transpose_b);
+
+        GpuMatMulSettings matmul_settings{};
+        matmul_settings.m0(M0);
+        matmul_settings.n0(N0);
+        matmul_settings.k0(K0);
+
+        ITensorInfo *ans_info = FunctionType::create_op(sketch, &lhs_info, &rhs_info, matmul_attr, matmul_settings);
+        GpuOutput::create_op(sketch, ans_info, &dst_info);
+
+        // Configure runtime
+        ClWorkloadRuntime runtime;
+        runtime.configure(sketch);
+
+        for(auto &data : runtime.get_auxiliary_tensors())
+        {
+            CLTensor     *tensor      = std::get<0>(data);
+            TensorInfo    info        = std::get<1>(data);
+            AuxMemoryInfo aux_mem_req = std::get<2>(data);
+            tensor->allocator()->init(info, aux_mem_req.alignment);
+            tensor->allocator()->allocate(); // Use ACL allocated memory
+        }
+
+        // Construct user tensors
+        TensorType t_lhs{};
+        TensorType t_rhs{};
+        TensorType t_dst{};
+
+        // Initialize user tensors
+        t_lhs.allocator()->init(lhs_info);
+        t_rhs.allocator()->init(rhs_info);
+        t_dst.allocator()->init(dst_info);
+
+        ARM_COMPUTE_ASSERT(t_lhs.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(t_rhs.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(t_dst.info()->is_resizable());
+
+        // Allocate and fill user tensors
+        t_lhs.allocator()->allocate();
+        t_rhs.allocator()->allocate();
+        t_dst.allocator()->allocate();
+
+        ARM_COMPUTE_ASSERT(!t_lhs.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!t_rhs.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!t_dst.info()->is_resizable());
+
+        fill(AccessorType(t_lhs), 0);
+        fill(AccessorType(t_rhs), 1);
+
+        // Run runtime
+        runtime.run({ &t_lhs, &t_rhs, &t_dst });
+
+        return t_dst;
+    }
+
+    SimpleTensor<T> compute_reference(const TensorShape &shape_a, const TensorShape &shape_b, const TensorShape &output_shape, bool pretranspose_a, bool pretranspose_b, DataType data_type)
+    {
+        // We collapse dimensions > 3 onto dimension 3, i.e. 5D+ tensors will look like 4D
+        // This is necessary unless we choose to extend gemm reference for 5D+ tensors
+        TensorShape output_shape_collapsed = output_shape.collapsed_from(Window::DimZ);
+        TensorShape shape_a_collapsed      = shape_a.collapsed_from(Window::DimZ);
+        TensorShape shape_b_collapsed      = shape_b.collapsed_from(Window::DimZ);
+
+        // Create reference
+        SimpleTensor<T> a{ shape_a_collapsed, data_type, 1 };
+        SimpleTensor<T> b{ shape_b_collapsed, data_type, 1 };
+        SimpleTensor<T> c{ output_shape_collapsed, data_type, 1 };
+
+        // Fill reference
+        fill(a, 0);
+        fill(b, 1);
+
+        /* Note: Assuming the usual batch matmul dimensions A = (B x M x K), B = (B x K x N), if pretranspose_A is set to true, then A is assumed to be (B x K x M),
+           therefore, A must be pre-transposed before passing it to the fixture. And, we transpose A again in the fixture to make it (B x M x K)
+           in order to be able to call reference implementation that works with (B x M x K) input.
+           Similarly, if pretranspose_B is set to true, then B is assumed to be (B x N x K), and B must be pre-transposed before passing it to the fixture. */
+
+        // Define transposed shapes
+        TensorShape a_transposed_shape(a.shape());
+        a_transposed_shape.set(0, a.shape().y());
+        a_transposed_shape.set(1, a.shape().x());
+
+        TensorShape b_transposed_shape(b.shape());
+        b_transposed_shape.set(0, b.shape().y());
+        b_transposed_shape.set(1, b.shape().x());
+
+        // Define transposed tensors
+        SimpleTensor<T> a_transposed{ a_transposed_shape, data_type };
+        SimpleTensor<T> b_transposed{ b_transposed_shape, data_type };
+
+        // Pre-transpose a if necessary
+        if(pretranspose_a)
+        {
+            a_transposed = reference::permute(a, PermutationVector(1U, 0U));
+        }
+
+        // Pre-transpose b if necessary
+        if(pretranspose_b)
+        {
+            b_transposed = reference::permute(b, PermutationVector(1U, 0U));
+        }
+
+        // Use transposed tensors if boolean enabled else use original tensors
+        SimpleTensor<T> result = reference::gemm((pretranspose_a) ? a_transposed : a, (pretranspose_b) ? b_transposed : b, c, 1.0f, 0.f);
+
+        // We reshape the gemm output back if the tensor is high dimensional
+        if(output_shape_collapsed != output_shape)
+        {
+            result = reference::reshape_layer(result, output_shape);
+        }
+
+        return result;
+    }
+
+    CLTensor        _target{};
+    SimpleTensor<T> _reference{};
+    bool            _device_supports_export_to_cl_image{ false };
+    bool            _device_supports_mmul{ false };
+};
+
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+class DynamicFusionGpuMatMulValidationFixture : public DynamicFusionGpuMatMulValidationGenericFixture<TensorType, AccessorType, FunctionType, T>
+{
+public:
+    void setup(TensorShape lhs_shape, TensorShape rhs_shape, TensorShape output_shape, bool transpose_a, bool transpose_b,
+               int M0, int N0, int K0, bool export_rhs_to_cl_image, DataType data_type)
+    {
+        ARM_COMPUTE_UNUSED(export_rhs_to_cl_image);
+        DynamicFusionGpuMatMulValidationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(lhs_shape, rhs_shape, output_shape, transpose_a, transpose_b, M0,
+                                                                                                         N0, K0, false /* export_rhs_to_cl_image */, data_type);
+    }
+};
+
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
+#endif // ACL_TESTS_VALIDATION_FIXTURES_DYNAMIC_FUSION_GPU_CL_MATMULKERNELFIXTURE_H
--
cgit v1.2.1
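
For context, driving the new nt_t path from user code follows the shape of DynamicFusionGpuMatMulValidationGenericFixture::compute_target() above. The sketch below is a condensed illustration only: the shapes, block sizes, and the function name run_nt_t_matmul are arbitrary examples, and tensor allocation plus error handling are omitted (see the fixture for the full sequence):

    #include "arm_compute/core/CL/CLKernelLibrary.h"
    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.h"
    #include "arm_compute/dynamic_fusion/sketch/attributes/MatMulAttributes.h"
    #include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h"
    #include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuMatMul.h"
    #include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuOutput.h"

    using namespace arm_compute;
    using namespace arm_compute::experimental::dynamic_fusion;

    void run_nt_t_matmul()
    {
        auto              cl_compile_ctx = CLKernelLibrary::get().get_compile_context();
        auto              context        = GpuWorkloadContext{ &cl_compile_ctx };
        GpuWorkloadSketch sketch{ &context };

        // LHS is M x K = 64 x 32; RHS is already transposed to N x K = 48 x 32 (nt_t)
        TensorInfo lhs_info = context.create_tensor_info(TensorInfo(TensorShape(32U, 64U), 1, DataType::F32));
        TensorInfo rhs_info = context.create_tensor_info(TensorInfo(TensorShape(32U, 48U), 1, DataType::F32));
        TensorInfo dst_info = context.create_tensor_info(); // shape inferred as M x N = 64 x 48

        MatMulAttributes matmul_attr{};
        matmul_attr.adj_lhs(false); // per the validate() comment in ClComponentMatMul.cpp,
        matmul_attr.adj_rhs(true);  // adj_lhs = false / adj_rhs = true is the supported case

        GpuMatMulSettings matmul_settings{};
        matmul_settings.m0(4); // block sizes must pass the SupportedBlockSizes checks,
        matmul_settings.n0(4); // e.g. N0 and K0 in {1, 2, 3, 4, 8, 16}
        matmul_settings.k0(4);

        ITensorInfo *ans_info = GpuMatMul::create_op(sketch, &lhs_info, &rhs_info, matmul_attr, matmul_settings);
        GpuOutput::create_op(sketch, ans_info, &dst_info);

        ClWorkloadRuntime runtime;
        runtime.configure(sketch);
        // Then allocate CLTensors from lhs_info/rhs_info/dst_info as in the fixture and call:
        // runtime.run({ &t_lhs, &t_rhs, &t_dst });
    }

Note that because adj_rhs = true is the only accepted RHS mode in this patch, the caller is expected to provide the RHS operand already laid out as N x K; the kernel then reads both operands along the contiguous K axis, which is what the MatMul_Nt_T tile operation assumes.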