/*
 * Copyright (c) 2023 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwMatMul.h"

#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
#include "src/core/helpers/WindowHelpers.h"
#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h"
#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/WriterHelper.h"
#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h"
#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h"
#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h"
#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
#include "support/StringSupport.h"

using namespace ckw;

namespace arm_compute
{
namespace experimental
{
namespace dynamic_fusion
{
GpuCkwMatMul::GpuCkwMatMul(ComponentId                      id,
                           const ArgumentPack<ITensorInfo> &tensors,
                           const Attributes                &attributes,
                           const Settings                  &settings)
    : IGpuCkwComponentDriver{id, tensors}, _lhs{}, _rhs{}, _dst{}, _attributes{attributes}, _settings{settings}
{
    _lhs = this->tensors().get_const_tensor(TensorType::ACL_SRC_0);
    _rhs = this->tensors().get_const_tensor(TensorType::ACL_SRC_1);
    _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0);
    ARM_COMPUTE_ERROR_ON_NULLPTR(_lhs, _rhs, _dst);
}

void GpuCkwMatMul::write_component_code(const ComponentGroup    &comp_group,
                                        GpuCkwVariableTable     &vtable,
                                        GpuCkwScopedKernelWriter writer) const
{
    const auto root_window = comp_group.get_root_component()->ckw_component_driver()->get_window();

    GpuCkwComponentArgument *lhs =
        vtable.declare_variable(comp_group, writer, _lhs, TensorStorageType::ClBufferUint8Ptr, "lhs");
    GpuCkwComponentArgument *rhs =
        vtable.declare_variable(comp_group, writer, _rhs, TensorStorageType::ClBufferUint8Ptr, "rhs");
    GpuCkwComponentArgument *dst =
        vtable.declare_variable(comp_group, writer, _dst, TensorStorageType::ClBufferUint8Ptr, "dst");

    // Constants
    const int   height_idx = get_data_layout_dimension_index(_lhs->data_layout(), DataLayoutDimension::HEIGHT);
    const auto &rhs_h      = writer->declare_tile("rhs_h", static_cast<int32_t>(_rhs->dimension(height_idx)));
    const int   m          = static_cast<int>(_dst->dimension(1));
    const int   n          = static_cast<int>(_dst->dimension(0));
    const int   k          = _attributes.adj_lhs() ? static_cast<int>(_lhs->tensor_shape().y())
                                                   : static_cast<int>(_lhs->tensor_shape().x());

    const int m0               = root_window.y().step();
    const int n0               = root_window.x().step();
    const int k0               = _settings.k0();
    const int partial_store_m0 = m % m0;
    const int partial_store_n0 = n % n0;
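    // Illustrative note on the tiling arithmetic above, using hypothetical
    // sizes (not taken from any real tensor): with m = 17 rows and a block
    // height m0 = 4, blocks 0..3 cover rows 0..15 and partial_store_m0 is
    // 17 % 4 = 1, i.e. the last block holds a single valid row. Roughly
    // speaking, get_coord() below shifts such a partial block back towards
    // the origin so a full m0 x n0 block can still be processed, and the
    // OverlappingMin addressing mode on the destination sampler keeps the
    // resulting overlapping stores in bounds. The same reasoning applies to
    // n, n0 and partial_store_n0.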
    const auto &const_1 = writer->declare_tile("1", 1);
    auto       &const_0 = writer->declare_tile("0", 0);
    auto       &k0_tile = writer->declare_tile("k0", k0);
    auto       &k_tile  = writer->declare_tile("k", k);

    auto &gid_0 = writer->declare_tile("gid_0", ckw::DataType::Int32);
    auto &gid_1 = writer->declare_tile("gid_1", ckw::DataType::Int32);
    auto &gid_2 = writer->declare_tile("gid_2", ckw::DataType::Int32);

    writer->op_get_global_id(gid_0, 0);
    writer->op_get_global_id(gid_1, 1);
    writer->op_get_global_id(gid_2, 2);

    auto &x = writer->declare_tile("x", ckw::DataType::Int32);
    auto &y = writer->declare_tile("y", ckw::DataType::Int32);
    auto &z = writer->declare_tile("z", ckw::DataType::Int32);

    get_coord(writer, x, gid_0, n0, partial_store_n0, "gid_x_", const_0);
    get_coord(writer, y, gid_1, m0, partial_store_m0, "gid_y_", const_0);
    get_coord(writer, z, gid_2, 1, 0, "gid_z_", const_0);

    TensorTileSampler lhs_sampler;
    lhs_sampler.height(m0);
    lhs_sampler.width(k0);
    lhs_sampler.format(TensorSamplerFormat::C_W_H);
    lhs_sampler.address_mode_x(TensorSamplerAddressModeX::None);
    lhs_sampler.address_mode_y(TensorSamplerAddressModeY::None);
    lhs_sampler.address_mode_z(TensorSamplerAddressModeZ::None);

    TensorTileSampler rhs_sampler;
    rhs_sampler.height(k0);
    rhs_sampler.width(n0);
    rhs_sampler.format(TensorSamplerFormat::C_WH_1);
    rhs_sampler.address_mode_x(TensorSamplerAddressModeX::None);
    rhs_sampler.address_mode_y(TensorSamplerAddressModeY::None);
    rhs_sampler.address_mode_z(TensorSamplerAddressModeZ::None);

    TensorTileSampler dst_sampler;
    dst_sampler.width(n0);
    dst_sampler.height(m0);
    dst_sampler.format(TensorSamplerFormat::C_W_H);
    dst_sampler.address_mode_x(TensorSamplerAddressModeX::OverlappingMin);
    dst_sampler.address_mode_y(TensorSamplerAddressModeY::None);
    dst_sampler.address_mode_z(TensorSamplerAddressModeZ::None);
    dst_sampler.x(x);
    dst_sampler.y(y);
    dst_sampler.z(z);
    dst_sampler.b(const_0);

    if (!dst->has_tile())
    {
        auto &dst_tile = writer->declare_tile("dst_tile", ckw::TileInfo(to_ckw(_dst->data_type()), m0, n0));
        dst->init_virtual_tensor(dst_tile, dst_sampler);
    }
    auto &dst_tile = dst->tile();

    // Initialize the accumulators
    writer->op_assign(dst_tile, const_0);

    auto &rhs_z = writer->declare_tile("rhs_z", ckw::DataType::Int32);
    writer->op_binary_expression(rhs_z, z, BinaryOp::Mul, rhs_h);

    auto &k_i     = writer->declare_tile("k_i", ckw::DataType::Int32);
    auto &k_limit = writer->declare_tile("k_limit", k - k0);

    auto &x_i = writer->declare_tile("x_i", ckw::DataType::Int32);
    writer->op_assign(x_i, const_0);

    writer->op_assign(k_i, const_0);

    // *INDENT-OFF*
    // clang-format off
    writer->op_for_loop(k_i, BinaryOp::LessEqual, k_limit, k_i, AssignmentOp::Increment, k0_tile,
    [&]()
    {
        // Initialize tiles
        // lhs_tile
        auto &a = writer->declare_tile("a", ckw::TileInfo(to_ckw(_lhs->data_type()), m0, k0));
        // rhs_tile
        auto &b = writer->declare_tile("b", ckw::TileInfo(to_ckw(_rhs->data_type()), n0, k0));
        writer->op_assign(a, const_0);
        writer->op_assign(b, const_0);

        // Loading the tiles
        // LHS
        lhs_sampler.x(x_i);
        lhs_sampler.y(y);
        lhs_sampler.z(z);
        lhs_sampler.b(const_0);
        writer->op_load(a, lhs->tensor(), lhs_sampler);

        // RHS
        auto &y_i = writer->declare_tile("y_i", ckw::DataType::Int32);
        writer->op_binary_expression(y_i, x, BinaryOp::Add, rhs_z);
        rhs_sampler.x(k_i);
        rhs_sampler.y(y_i);
        rhs_sampler.z(const_0);
        rhs_sampler.b(const_0);
        writer->op_load(b, rhs->tensor(), rhs_sampler);

        // Perform Matmul
        writer->op_binary_expression(dst_tile, a, BinaryOp::MatMul_Nt_T, b);
        writer->op_binary_expression(x_i, x_i, BinaryOp::Add, k0_tile);
    });
    // *INDENT-ON*
    // clang-format on
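    // For reference, the loop above together with the leftover loop below is
    // expected to emit code of the following shape; this is an illustrative
    // pseudocode sketch of the generated kernel, not the literal writer output:
    //
    //   dst_tile = 0;
    //   for (k_i = 0; k_i <= k - k0; k_i += k0)  // full k0-wide steps
    //       dst_tile += a(m0 x k0) * b(n0 x k0)^T;
    //   for (; k_i < k; ++k_i)                   // single columns when k % k0 != 0
    //       dst_tile += a(m0 x 1) * b(n0 x 1)^T;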
    // Handling leftovers
    if (k % k0 != 0)
    {
        // *INDENT-OFF*
        // clang-format off
        writer->op_for_loop(k_i, BinaryOp::Less, k_tile, k_i, AssignmentOp::Increment, const_1,
        [&]()
        {
            // Initialize tiles
            // lhs_tile
            auto &a = writer->declare_tile("a_leftover", ckw::TileInfo(to_ckw(_lhs->data_type()), m0, 1));
            // rhs_tile
            auto &b = writer->declare_tile("b_leftover", ckw::TileInfo(to_ckw(_rhs->data_type()), n0, 1));
            writer->op_assign(a, const_0);
            writer->op_assign(b, const_0);

            // Loading the tiles
            // LHS
            lhs_sampler.x(x_i);
            lhs_sampler.y(y);
            lhs_sampler.z(z);
            lhs_sampler.b(const_0);
            writer->op_load(a, lhs->tensor(), lhs_sampler);

            // RHS
            auto &y_i = writer->declare_tile("y_i_leftover", ckw::DataType::Int32);
            writer->op_binary_expression(y_i, x, BinaryOp::Add, rhs_z);
            rhs_sampler.x(k_i);
            rhs_sampler.y(y_i);
            rhs_sampler.z(const_0);
            rhs_sampler.b(const_0);
            writer->op_load(b, rhs->tensor(), rhs_sampler);

            // Perform Matmul
            writer->op_binary_expression(dst_tile, a, BinaryOp::MatMul_Nt_T, b);
            writer->op_binary_expression(x_i, x_i, BinaryOp::Add, const_1);
        });
        // *INDENT-ON*
        // clang-format on
    }
}

Window GpuCkwMatMul::get_window() const
{
    ARM_COMPUTE_ERROR_ON_MSG(_dst->tensor_shape().total_size() == 0U, "Destination tensor is not initialized");

    const int m = _dst->dimension(1);
    const int n = _dst->dimension(0);

    const bool adj_lhs = _attributes.adj_lhs();

    int m0 = adj_lhs ? adjust_vec_size(_settings.m0(), m) : std::min(_settings.m0(), m);
    int n0 = adjust_vec_size(_settings.n0(), n);

    // Configure kernel window
    Window win = calculate_max_window(_dst->tensor_shape(), Steps(n0, m0));
    win        = win.collapse(win, Window::DimZ);

    return win;
}

std::string GpuCkwMatMul::get_name(const ComponentGroup &comp_group) const
{
    ARM_COMPUTE_UNUSED(comp_group);

    std::string kernel_name("mat_mul_native");

    const int m = _dst->dimension(1);
    const int n = _dst->dimension(0);
    const int k = _attributes.adj_lhs() ? _lhs->tensor_shape().y() : _lhs->tensor_shape().x();

    kernel_name += _attributes.adj_lhs() ? "_t" : "_nt";
    kernel_name += _attributes.adj_rhs() ? "_t" : "_nt";
    kernel_name += "_";
    kernel_name += support::cpp11::to_string(m);
    kernel_name += "_";
    kernel_name += support::cpp11::to_string(n);
    kernel_name += "_";
    kernel_name += support::cpp11::to_string(k);
    kernel_name += "_";
    kernel_name += support::cpp11::to_string(_dst->dimension(2));
    kernel_name += "_";
    kernel_name += support::cpp11::to_string(_settings.m0());
    kernel_name += "_";
    kernel_name += support::cpp11::to_string(_settings.n0());
    kernel_name += "_";
    kernel_name += support::cpp11::to_string(_settings.k0());

    return kernel_name;
}
} // namespace dynamic_fusion
} // namespace experimental
} // namespace arm_compute
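// Example of the name produced by get_name() above, assuming a hypothetical
// non-transposed 64x64x64 single-batch matmul with m0 = n0 = k0 = 4:
//   "mat_mul_native_nt_nt_64_64_64_1_4_4_4"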