From fde45d836cf753a94915ac42d8a13da7edc52221 Mon Sep 17 00:00:00 2001 From: Adnan AlSinan Date: Tue, 24 Oct 2023 12:03:21 +0100 Subject: Extend CKW MatMul with nt_t - Add the kernel variant: (nt_t) to GpuCKWMatMul. - Extend CKW MatMul validation test with nt_t. - Fixes a bug in CKW where z-dim = 1. Resolves: COMPMID-6435 Signed-off-by: Adnan AlSinan Change-Id: I4c5e8791e55f21ffff3c11eca7802c51a4259977 Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/10525 Comments-Addressed: Arm Jenkins Tested-by: Arm Jenkins Reviewed-by: Gian Marco Iodice Benchmark: Arm Jenkins --- .../gpu/ckw_driver/components/GpuCkwMatMul.cpp | 220 ++++++++++++++++++++- .../sketch/gpu/components/cl/ClComponentMatMul.cpp | 20 +- .../sketch/gpu/operators/GpuMatMul.cpp | 2 - 3 files changed, 227 insertions(+), 15 deletions(-) (limited to 'src') diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwMatMul.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwMatMul.cpp index 77e5f7af01..9beba03598 100644 --- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwMatMul.cpp +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwMatMul.cpp @@ -24,9 +24,18 @@ #include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwMatMul.h" +#include "arm_compute/core/utils/helpers/AdjustVecSize.h" + +#include "src/core/helpers/WindowHelpers.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/WriterHelper.h" #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h" #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h" +#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" +#include "support/StringSupport.h" +using namespace ckw; namespace arm_compute { namespace experimental @@ -50,20 +59,225 @@ void 
GpuCkwMatMul::write_component_code(const ComponentGroup &comp_group, GpuCkwVariableTable &vtable, GpuCkwScopedKernelWriter writer) const { - ARM_COMPUTE_UNUSED(comp_group, vtable, writer); + const auto root_window = comp_group.get_root_component()->ckw_component_driver()->get_window(); + + GpuCkwComponentArgument *lhs = + vtable.declare_variable(comp_group, writer, _lhs, TensorStorageType::ClBufferUint8Ptr, "lhs"); + GpuCkwComponentArgument *rhs = + vtable.declare_variable(comp_group, writer, _rhs, TensorStorageType::ClBufferUint8Ptr, "rhs"); + GpuCkwComponentArgument *dst = + vtable.declare_variable(comp_group, writer, _dst, TensorStorageType::ClBufferUint8Ptr, "dst"); + + // Constants + const int height_idx = get_data_layout_dimension_index(_lhs->data_layout(), DataLayoutDimension::HEIGHT); + const auto &rhs_h = writer->declare_tile("rhs_h", static_cast<int32_t>(_rhs->dimension(height_idx))); + const int m = static_cast<int>(_dst->dimension(1)); + const int n = static_cast<int>(_dst->dimension(0)); + const int k = + _attributes.adj_lhs() ? 
static_cast<int>(_lhs->tensor_shape().y()) : static_cast<int>(_lhs->tensor_shape().x()); + const int m0 = root_window.y().step(); + const int n0 = root_window.x().step(); + const int k0 = _settings.k0(); + const int partial_store_m0 = m % m0; + const int partial_store_n0 = n % n0; + + const auto &const_1 = writer->declare_tile("1", 1); + auto &const_0 = writer->declare_tile("0", 0); + auto &k0_tile = writer->declare_tile("k0", k0); + auto &k_tile = writer->declare_tile("k", k); + + auto &gid_0 = writer->declare_tile("gid_0", ckw::DataType::Int32); + auto &gid_1 = writer->declare_tile("gid_1", ckw::DataType::Int32); + auto &gid_2 = writer->declare_tile("gid_2", ckw::DataType::Int32); + + writer->op_get_global_id(gid_0, 0); + writer->op_get_global_id(gid_1, 1); + writer->op_get_global_id(gid_2, 2); + + auto &x = writer->declare_tile("x", ckw::DataType::Int32); + auto &y = writer->declare_tile("y", ckw::DataType::Int32); + auto &z = writer->declare_tile("z", ckw::DataType::Int32); + + get_coord(writer, x, gid_0, n0, partial_store_n0, "gid_x_", const_0); + get_coord(writer, y, gid_1, m0, partial_store_m0, "gid_y_", const_0); + get_coord(writer, z, gid_2, 1, 0, "gid_z_", const_0); + + TensorTileSampler lhs_sampler; + lhs_sampler.height(m0); + lhs_sampler.width(k0); + lhs_sampler.format(TensorSamplerFormat::C_W_H); + lhs_sampler.address_mode_x(TensorSamplerAddressModeX::None); + lhs_sampler.address_mode_y(TensorSamplerAddressModeY::None); + lhs_sampler.address_mode_z(TensorSamplerAddressModeZ::None); + + TensorTileSampler rhs_sampler; + rhs_sampler.height(k0); + rhs_sampler.width(n0); + rhs_sampler.format(TensorSamplerFormat::C_WH_1); + rhs_sampler.address_mode_x(TensorSamplerAddressModeX::None); + rhs_sampler.address_mode_y(TensorSamplerAddressModeY::None); + rhs_sampler.address_mode_z(TensorSamplerAddressModeZ::None); + + TensorTileSampler dst_sampler; + dst_sampler.width(n0); + dst_sampler.height(m0); + dst_sampler.format(TensorSamplerFormat::C_W_H); + 
dst_sampler.address_mode_x(TensorSamplerAddressModeX::OverlappingMin); + dst_sampler.address_mode_y(TensorSamplerAddressModeY::None); + dst_sampler.address_mode_z(TensorSamplerAddressModeZ::None); + dst_sampler.x(x); + dst_sampler.y(y); + dst_sampler.z(z); + dst_sampler.b(const_0); + + if (!dst->has_tile()) + { + auto &dst_tile = writer->declare_tile("dst_tile", ckw::TileInfo(to_ckw(_dst->data_type()), m0, n0)); + dst->init_virtual_tensor(dst_tile, dst_sampler); + } + auto &dst_tile = dst->tile(); + + // Initialize the accumulators + writer->op_assign(dst_tile, const_0); + + auto &rhs_z = writer->declare_tile("rhs_z", ckw::DataType::Int32); + writer->op_binary_expression(rhs_z, z, BinaryOp::Mul, rhs_h); + + auto &k_i = writer->declare_tile("k_i", ckw::DataType::Int32); + auto &k_limit = writer->declare_tile("k_limit", k - k0); + + auto &x_i = writer->declare_tile("x_i", ckw::DataType::Int32); + writer->op_assign(x_i, const_0); + + writer->op_assign(k_i, const_0); + + // *INDENT-OFF* + // clang-format off + writer->op_for_loop(k_i, BinaryOp::LessEqual, k_limit, k_i, AssignmentOp::Increment, k0_tile, + [&]() + { + //Initialize tiles + // lhs_tile + auto &a = writer->declare_tile("a", ckw::TileInfo(to_ckw(_lhs->data_type()), m0, k0)); + // rhs_tile + auto &b = writer->declare_tile("b", ckw::TileInfo(to_ckw(_rhs->data_type()), n0, k0)); + writer->op_assign(a, const_0); + writer->op_assign(b, const_0); + + // Loading the tiles + // LHS + lhs_sampler.x(x_i); + lhs_sampler.y(y); + lhs_sampler.z(z); + lhs_sampler.b(const_0); + writer->op_load(a, lhs->tensor(), lhs_sampler); + + // RHS + auto &y_i = writer->declare_tile("y_i", ckw::DataType::Int32); + writer->op_binary_expression(y_i, x, BinaryOp::Add, rhs_z); + rhs_sampler.x(k_i); + rhs_sampler.y(y_i); + rhs_sampler.z(const_0); + rhs_sampler.b(const_0); + writer->op_load(b, rhs->tensor(), rhs_sampler); + + // Perform Matmul + writer->op_binary_expression(dst_tile, a, BinaryOp::MatMul_Nt_T, b); + 
writer->op_binary_expression(x_i, x_i, BinaryOp::Add, k0_tile); + }); +// *INDENT-ON* + // clang-format on + + // Handling leftovers + if (k % k0 != 0) + { + // *INDENT-OFF* + // clang-format off + writer->op_for_loop(k_i, BinaryOp::Less, k_tile, k_i, AssignmentOp::Increment, const_1, + [&]() + { + //Initialize tiles + // lhs_tile + auto &a = + writer->declare_tile("a_leftover", ckw::TileInfo(to_ckw(_lhs->data_type()), m0, 1)); + // rhs_tile + auto &b = + writer->declare_tile("b_leftover", ckw::TileInfo(to_ckw(_rhs->data_type()), n0, 1)); + writer->op_assign(a, const_0); + writer->op_assign(b, const_0); + + // Loading the tiles + // LHS + lhs_sampler.x(x_i); + lhs_sampler.y(y); + lhs_sampler.z(z); + lhs_sampler.b(const_0); + writer->op_load(a, lhs->tensor(), lhs_sampler); + + // RHS + auto &y_i = writer->declare_tile("y_i_leftover", ckw::DataType::Int32); + writer->op_binary_expression(y_i, x, BinaryOp::Add, rhs_z); + rhs_sampler.x(k_i); + rhs_sampler.y(y_i); + rhs_sampler.z(const_0); + rhs_sampler.b(const_0); + writer->op_load(b, rhs->tensor(), rhs_sampler); + + // Perform Matmul + writer->op_binary_expression(dst_tile, a, BinaryOp::MatMul_Nt_T, b); + writer->op_binary_expression(x_i, x_i, BinaryOp::Add, const_1); + }); +// *INDENT-ON* + // clang-format on + } } Window GpuCkwMatMul::get_window() const { ARM_COMPUTE_ERROR_ON_MSG(_dst->tensor_shape().total_size() == 0U, "Destination tensor is not initialized"); - return Window(); + + const int m = _dst->dimension(1); + const int n = _dst->dimension(0); + const bool adj_lhs = _attributes.adj_lhs(); + + int m0 = adj_lhs ? 
adjust_vec_size(_settings.m0(), m) : std::min(_settings.m0(), m); + int n0 = adjust_vec_size(_settings.n0(), n); + + // Configure kernel window + Window win = calculate_max_window(_dst->tensor_shape(), Steps(n0, m0)); + win = win.collapse(win, Window::DimZ); + + return win; } std::string GpuCkwMatMul::get_name(const ComponentGroup &comp_group) const { ARM_COMPUTE_UNUSED(comp_group); - return "MatMul"; + std::string kernel_name("mat_mul_native"); + + const int m = _dst->dimension(1); + const int n = _dst->dimension(0); + const int k = _attributes.adj_lhs() ? _lhs->tensor_shape().y() : _lhs->tensor_shape().x(); + + kernel_name += _attributes.adj_lhs() ? "_t" : "_nt"; + kernel_name += _attributes.adj_rhs() ? "_t" : "_nt"; + kernel_name += "_"; + kernel_name += support::cpp11::to_string(m); + kernel_name += "_"; + kernel_name += support::cpp11::to_string(n); + kernel_name += "_"; + kernel_name += support::cpp11::to_string(k); + kernel_name += "_"; + kernel_name += support::cpp11::to_string(_dst->dimension(2)); + kernel_name += "_"; + kernel_name += support::cpp11::to_string(_settings.m0()); + kernel_name += "_"; + kernel_name += support::cpp11::to_string(_settings.n0()); + kernel_name += "_"; + kernel_name += support::cpp11::to_string(_settings.k0()); + + return kernel_name; } } // namespace dynamic_fusion diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentMatMul.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentMatMul.cpp index eada61e1b3..f238d42d98 100644 --- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentMatMul.cpp +++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentMatMul.cpp @@ -91,14 +91,16 @@ Status ClComponentMatMul::validate(const Properties &properties, const auto rhs = tensors.get_const_tensor(TensorType::ACL_SRC_1); const auto dst = tensors.get_const_tensor(TensorType::ACL_DST_0); + // Currently, the only supported case is when adj_lhs = false and adj_rhs = true + 
ARM_COMPUTE_RETURN_ERROR_ON((attributes.adj_lhs() != false) || (attributes.adj_rhs() != true)); + // Check if Matching data type ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, rhs); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, dst); // Data type ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F16, DataType::F32); - // Data layout - ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(lhs, DataLayout::NHWC); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, dst); // All tensor infos are initialized ARM_COMPUTE_RETURN_ERROR_ON(lhs->tensor_shape().total_size() == 0); @@ -108,20 +110,18 @@ Status ClComponentMatMul::validate(const Properties &properties, // Device requirements are met ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(lhs); - // Check if dst shape is correct + // Check if block sizes are supported MatMulKernelInfo matmul_kernel_info = MatMulKernelInfo(attributes.adj_lhs(), attributes.adj_rhs(), settings.m0(), settings.n0(), settings.k0()); - const auto expected_dst_shape = - misc::shape_calculator::compute_matmul_shape(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info); - - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), expected_dst_shape); - - // Check if block sizes are supported ARM_COMPUTE_RETURN_ON_ERROR(validate_matmul_kernel_info(attributes, settings)); - ARM_COMPUTE_RETURN_ON_ERROR( opencl::kernels::validate_matmul_input_shapes(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info)); + // Check if dst shape is correct + const auto expected_dst_shape = + misc::shape_calculator::compute_matmul_shape(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), expected_dst_shape); + return Status{}; } diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuMatMul.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuMatMul.cpp index ee27b5ea47..e24629a036 100644 --- 
a/src/dynamic_fusion/sketch/gpu/operators/GpuMatMul.cpp +++ b/src/dynamic_fusion/sketch/gpu/operators/GpuMatMul.cpp @@ -87,8 +87,6 @@ Status is_supported_op_helper(const GpuWorkloadContext &context, // Check support level // Data type ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F16, DataType::F32); - // Data layout - ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(lhs, DataLayout::NHWC); // Check components if (context.gpu_language() == GpuLanguage::OpenCL) -- cgit v1.2.1