author     Adnan AlSinan <adnan.alsinan@arm.com>  2023-10-24 12:03:21 +0100
committer  Adnan AlSinan <adnan.alsinan@arm.com>  2023-10-31 11:00:45 +0000
commit     fde45d836cf753a94915ac42d8a13da7edc52221 (patch)
tree       6ed787749aa3caec13a0b3c2c64ea591b423089c
parent     5ef0bdd53dd2ce6bc7ad28077ffac3bf9e939b5f (diff)
download   ComputeLibrary-fde45d836cf753a94915ac42d8a13da7edc52221.tar.gz
Extend CKW MatMul with nt_t
- Add the kernel variant (nt_t) to GpuCKWMatMul.
- Extend the CKW MatMul validation tests with nt_t.
- Fix a bug in CKW address generation when z-dim = 1.

Resolves: COMPMID-6435
Signed-off-by: Adnan AlSinan <adnan.alsinan@arm.com>
Change-Id: I4c5e8791e55f21ffff3c11eca7802c51a4259977
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/10525
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Benchmark: Arm Jenkins <bsgcomp@arm.com>
-rw-r--r--  compute_kernel_writer/prototype/src/Prototype.h                          2
-rw-r--r--  docs/user_guide/release_version_and_change_log.dox                       1
-rw-r--r--  src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwMatMul.cpp   220
-rw-r--r--  src/dynamic_fusion/sketch/gpu/components/cl/ClComponentMatMul.cpp       20
-rw-r--r--  src/dynamic_fusion/sketch/gpu/operators/GpuMatMul.cpp                    2
-rw-r--r--  tests/validation/dynamic_fusion/gpu/cl/MatMul.cpp                      350
-rw-r--r--  tests/validation/fixtures/dynamic_fusion/gpu/cl/MatMulKernelFixture.h  267
7 files changed, 846 insertions, 16 deletions
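
For context: in the nt_t variant added here, the LHS is consumed as-is (M x K) while the RHS is supplied transposed (N x K), so every destination element is a dot product of an LHS row and an RHS row, both walking contiguous memory along K. A minimal scalar model of that contraction, assuming row-major storage (illustrative C++ only, not ACL code):

    #include <cstddef>
    #include <vector>

    // Scalar reference for the nt_t variant: lhs is M x K, rhs is N x K
    // (i.e. B stored transposed), both row-major; dst is M x N.
    std::vector<float> matmul_nt_t(const std::vector<float> &lhs, const std::vector<float> &rhs, int M, int N, int K)
    {
        std::vector<float> dst(static_cast<std::size_t>(M) * N, 0.0f);
        for (int m = 0; m < M; ++m)
        {
            for (int n = 0; n < N; ++n)
            {
                float acc = 0.0f;
                for (int k = 0; k < K; ++k)
                {
                    acc += lhs[m * K + k] * rhs[n * K + k]; // both operands walk contiguous K
                }
                dst[static_cast<std::size_t>(m) * N + n] = acc;
            }
        }
        return dst;
    }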
diff --git a/compute_kernel_writer/prototype/src/Prototype.h b/compute_kernel_writer/prototype/src/Prototype.h
index 433eef9e7b..b392fe2651 100644
--- a/compute_kernel_writer/prototype/src/Prototype.h
+++ b/compute_kernel_writer/prototype/src/Prototype.h
@@ -3050,7 +3050,7 @@ private:
address += " * ";
address += stride_y;
}
- if (z != "0" && (_mapper.is_one_component_z() != true))
+ if (z != "0")
{
const std::string stride_z = _mapper.tensor_component_stride_z();
address += " + (";
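
The hunk above is the z-dim = 1 fix mentioned in the commit message: the z-stride term used to be skipped whenever the mapper reported a one-component z dimension, which silently dropped the contribution of a runtime z coordinate on tensors declared with z-dim = 1. After the fix, the term is appended whenever the z coordinate expression is not the literal "0". A hedged sketch of the resulting address construction (names illustrative, not the prototype's exact code):

    #include <string>

    // Builds the flat-address expression: x [+ y * stride_y] [+ (z) * stride_z]
    std::string build_address(const std::string &x, const std::string &y, const std::string &z,
                              const std::string &stride_y, const std::string &stride_z)
    {
        std::string address = x;
        if (y != "0")
        {
            address += " + " + y + " * " + stride_y;
        }
        if (z != "0") // no is_one_component_z() guard any more
        {
            address += " + (" + z + ") * " + stride_z;
        }
        return address;
    }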
diff --git a/docs/user_guide/release_version_and_change_log.dox b/docs/user_guide/release_version_and_change_log.dox
index 2b8f5d87a1..9e67d7e6b4 100644
--- a/docs/user_guide/release_version_and_change_log.dox
+++ b/docs/user_guide/release_version_and_change_log.dox
@@ -49,6 +49,7 @@ v23.11 Public major release
- @ref experimental::dynamic_fusion::GpuCkwResize
- @ref experimental::dynamic_fusion::GpuCkwPool2d
- @ref experimental::dynamic_fusion::GpuCkwDepthwiseConv2d
+ - @ref experimental::dynamic_fusion::GpuCkwMatMul
 - Add support for OpenCL™ command buffer with mutable dispatch extension.
 - Update OpenCL™ API headers to v2023.04.17.
- Remove legacy PostOps interface. PostOps was the experimental interface for kernel fusion and is replaced by the new Dynamic Fusion interface.
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwMatMul.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwMatMul.cpp
index 77e5f7af01..9beba03598 100644
--- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwMatMul.cpp
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwMatMul.cpp
@@ -24,9 +24,18 @@
#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwMatMul.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/WriterHelper.h"
#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h"
#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
+#include "support/StringSupport.h"
+using namespace ckw;
namespace arm_compute
{
namespace experimental
@@ -50,20 +59,225 @@ void GpuCkwMatMul::write_component_code(const ComponentGroup &comp_group,
GpuCkwVariableTable &vtable,
GpuCkwScopedKernelWriter writer) const
{
- ARM_COMPUTE_UNUSED(comp_group, vtable, writer);
+ const auto root_window = comp_group.get_root_component()->ckw_component_driver()->get_window();
+
+ GpuCkwComponentArgument *lhs =
+ vtable.declare_variable(comp_group, writer, _lhs, TensorStorageType::ClBufferUint8Ptr, "lhs");
+ GpuCkwComponentArgument *rhs =
+ vtable.declare_variable(comp_group, writer, _rhs, TensorStorageType::ClBufferUint8Ptr, "rhs");
+ GpuCkwComponentArgument *dst =
+ vtable.declare_variable(comp_group, writer, _dst, TensorStorageType::ClBufferUint8Ptr, "dst");
+
+ // Constants
+ const int height_idx = get_data_layout_dimension_index(_lhs->data_layout(), DataLayoutDimension::HEIGHT);
+ const auto &rhs_h = writer->declare_tile("rhs_h", static_cast<int32_t>(_rhs->dimension(height_idx)));
+ const int m = static_cast<int>(_dst->dimension(1));
+ const int n = static_cast<int>(_dst->dimension(0));
+ const int k =
+ _attributes.adj_lhs() ? static_cast<int>(_lhs->tensor_shape().y()) : static_cast<int>(_lhs->tensor_shape().x());
+ const int m0 = root_window.y().step();
+ const int n0 = root_window.x().step();
+ const int k0 = _settings.k0();
+ const int partial_store_m0 = m % m0;
+ const int partial_store_n0 = n % n0;
+
+ const auto &const_1 = writer->declare_tile("1", 1);
+ auto &const_0 = writer->declare_tile("0", 0);
+ auto &k0_tile = writer->declare_tile("k0", k0);
+ auto &k_tile = writer->declare_tile("k", k);
+
+ auto &gid_0 = writer->declare_tile("gid_0", ckw::DataType::Int32);
+ auto &gid_1 = writer->declare_tile("gid_1", ckw::DataType::Int32);
+ auto &gid_2 = writer->declare_tile("gid_2", ckw::DataType::Int32);
+
+ writer->op_get_global_id(gid_0, 0);
+ writer->op_get_global_id(gid_1, 1);
+ writer->op_get_global_id(gid_2, 2);
+
+ auto &x = writer->declare_tile("x", ckw::DataType::Int32);
+ auto &y = writer->declare_tile("y", ckw::DataType::Int32);
+ auto &z = writer->declare_tile("z", ckw::DataType::Int32);
+
+ get_coord(writer, x, gid_0, n0, partial_store_n0, "gid_x_", const_0);
+ get_coord(writer, y, gid_1, m0, partial_store_m0, "gid_y_", const_0);
+ get_coord(writer, z, gid_2, 1, 0, "gid_z_", const_0);
+
+ TensorTileSampler lhs_sampler;
+ lhs_sampler.height(m0);
+ lhs_sampler.width(k0);
+ lhs_sampler.format(TensorSamplerFormat::C_W_H);
+ lhs_sampler.address_mode_x(TensorSamplerAddressModeX::None);
+ lhs_sampler.address_mode_y(TensorSamplerAddressModeY::None);
+ lhs_sampler.address_mode_z(TensorSamplerAddressModeZ::None);
+
+ TensorTileSampler rhs_sampler;
+ rhs_sampler.height(k0);
+ rhs_sampler.width(n0);
+ rhs_sampler.format(TensorSamplerFormat::C_WH_1);
+ rhs_sampler.address_mode_x(TensorSamplerAddressModeX::None);
+ rhs_sampler.address_mode_y(TensorSamplerAddressModeY::None);
+ rhs_sampler.address_mode_z(TensorSamplerAddressModeZ::None);
+
+ TensorTileSampler dst_sampler;
+ dst_sampler.width(n0);
+ dst_sampler.height(m0);
+ dst_sampler.format(TensorSamplerFormat::C_W_H);
+ dst_sampler.address_mode_x(TensorSamplerAddressModeX::OverlappingMin);
+ dst_sampler.address_mode_y(TensorSamplerAddressModeY::None);
+ dst_sampler.address_mode_z(TensorSamplerAddressModeZ::None);
+ dst_sampler.x(x);
+ dst_sampler.y(y);
+ dst_sampler.z(z);
+ dst_sampler.b(const_0);
+
+ if (!dst->has_tile())
+ {
+ auto &dst_tile = writer->declare_tile("dst_tile", ckw::TileInfo(to_ckw(_dst->data_type()), m0, n0));
+ dst->init_virtual_tensor(dst_tile, dst_sampler);
+ }
+ auto &dst_tile = dst->tile();
+
+ // Initialize the accumulators
+ writer->op_assign(dst_tile, const_0);
+
+ auto &rhs_z = writer->declare_tile("rhs_z", ckw::DataType::Int32);
+ writer->op_binary_expression(rhs_z, z, BinaryOp::Mul, rhs_h);
+
+ auto &k_i = writer->declare_tile("k_i", ckw::DataType::Int32);
+ auto &k_limit = writer->declare_tile("k_limit", k - k0);
+
+ auto &x_i = writer->declare_tile("x_i", ckw::DataType::Int32);
+ writer->op_assign(x_i, const_0);
+
+ writer->op_assign(k_i, const_0);
+
+ // *INDENT-OFF*
+ // clang-format off
+ writer->op_for_loop(k_i, BinaryOp::LessEqual, k_limit, k_i, AssignmentOp::Increment, k0_tile,
+ [&]()
+ {
+ // Initialize tiles
+ // lhs_tile
+ auto &a = writer->declare_tile("a", ckw::TileInfo(to_ckw(_lhs->data_type()), m0, k0));
+ // rhs_tile
+ auto &b = writer->declare_tile("b", ckw::TileInfo(to_ckw(_rhs->data_type()), n0, k0));
+ writer->op_assign(a, const_0);
+ writer->op_assign(b, const_0);
+
+ // Loading the tiles
+ // LHS
+ lhs_sampler.x(x_i);
+ lhs_sampler.y(y);
+ lhs_sampler.z(z);
+ lhs_sampler.b(const_0);
+ writer->op_load(a, lhs->tensor(), lhs_sampler);
+
+ // RHS
+ auto &y_i = writer->declare_tile("y_i", ckw::DataType::Int32);
+ writer->op_binary_expression(y_i, x, BinaryOp::Add, rhs_z);
+ rhs_sampler.x(k_i);
+ rhs_sampler.y(y_i);
+ rhs_sampler.z(const_0);
+ rhs_sampler.b(const_0);
+ writer->op_load(b, rhs->tensor(), rhs_sampler);
+
+ // Perform Matmul
+ writer->op_binary_expression(dst_tile, a, BinaryOp::MatMul_Nt_T, b);
+ writer->op_binary_expression(x_i, x_i, BinaryOp::Add, k0_tile);
+ });
+// *INDENT-ON*
+ // clang-format on
+
+ // Handling leftovers
+ if (k % k0 != 0)
+ {
+ // *INDENT-OFF*
+ // clang-format off
+ writer->op_for_loop(k_i, BinaryOp::Less, k_tile, k_i, AssignmentOp::Increment, const_1,
+ [&]()
+ {
+ // Initialize tiles
+ // lhs_tile
+ auto &a =
+ writer->declare_tile("a_leftover", ckw::TileInfo(to_ckw(_lhs->data_type()), m0, 1));
+ // rhs_tile
+ auto &b =
+ writer->declare_tile("b_leftover", ckw::TileInfo(to_ckw(_rhs->data_type()), n0, 1));
+ writer->op_assign(a, const_0);
+ writer->op_assign(b, const_0);
+
+ // Loading the tiles
+ // LHS
+ lhs_sampler.x(x_i);
+ lhs_sampler.y(y);
+ lhs_sampler.z(z);
+ lhs_sampler.b(const_0);
+ writer->op_load(a, lhs->tensor(), lhs_sampler);
+
+ // RHS
+ auto &y_i = writer->declare_tile("y_i_leftover", ckw::DataType::Int32);
+ writer->op_binary_expression(y_i, x, BinaryOp::Add, rhs_z);
+ rhs_sampler.x(k_i);
+ rhs_sampler.y(y_i);
+ rhs_sampler.z(const_0);
+ rhs_sampler.b(const_0);
+ writer->op_load(b, rhs->tensor(), rhs_sampler);
+
+ // Perform Matmul
+ writer->op_binary_expression(dst_tile, a, BinaryOp::MatMul_Nt_T, b);
+ writer->op_binary_expression(x_i, x_i, BinaryOp::Add, const_1);
+ });
+// *INDENT-ON*
+ // clang-format on
+ }
}
Window GpuCkwMatMul::get_window() const
{
ARM_COMPUTE_ERROR_ON_MSG(_dst->tensor_shape().total_size() == 0U, "Destination tensor is not initialized");
- return Window();
+
+ const int m = _dst->dimension(1);
+ const int n = _dst->dimension(0);
+ const bool adj_lhs = _attributes.adj_lhs();
+
+ int m0 = adj_lhs ? adjust_vec_size(_settings.m0(), m) : std::min(_settings.m0(), m);
+ int n0 = adjust_vec_size(_settings.n0(), n);
+
+ // Configure kernel window
+ Window win = calculate_max_window(_dst->tensor_shape(), Steps(n0, m0));
+ win = win.collapse(win, Window::DimZ);
+
+ return win;
}
std::string GpuCkwMatMul::get_name(const ComponentGroup &comp_group) const
{
ARM_COMPUTE_UNUSED(comp_group);
- return "MatMul";
+ std::string kernel_name("mat_mul_native");
+
+ const int m = _dst->dimension(1);
+ const int n = _dst->dimension(0);
+ const int k = _attributes.adj_lhs() ? _lhs->tensor_shape().y() : _lhs->tensor_shape().x();
+
+ kernel_name += _attributes.adj_lhs() ? "_t" : "_nt";
+ kernel_name += _attributes.adj_rhs() ? "_t" : "_nt";
+ kernel_name += "_";
+ kernel_name += support::cpp11::to_string(m);
+ kernel_name += "_";
+ kernel_name += support::cpp11::to_string(n);
+ kernel_name += "_";
+ kernel_name += support::cpp11::to_string(k);
+ kernel_name += "_";
+ kernel_name += support::cpp11::to_string(_dst->dimension(2));
+ kernel_name += "_";
+ kernel_name += support::cpp11::to_string(_settings.m0());
+ kernel_name += "_";
+ kernel_name += support::cpp11::to_string(_settings.n0());
+ kernel_name += "_";
+ kernel_name += support::cpp11::to_string(_settings.k0());
+
+ return kernel_name;
}
} // namespace dynamic_fusion
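
Two details of the component above are worth making concrete. First, the accumulation is split into a blocked main loop and a leftover loop: the main loop steps k_i by k0 while k_i <= k - k0, and the one-element leftover loop is emitted only when k % k0 != 0. A host-side model of that structure (plain C++ for illustration; the component actually emits OpenCL through the CKW writer):

    // Dot product of one LHS row with one RHS row (both of length k), using the
    // same main-loop/leftover-loop split as the generated kernel.
    float dot_nt_t_blocked(const float *lhs_row, const float *rhs_row, int k, int k0)
    {
        float acc = 0.0f;                // mirrors op_assign(dst_tile, const_0)
        int   k_i = 0;
        for (; k_i <= k - k0; k_i += k0) // main loop: BinaryOp::LessEqual against k - k0
        {
            for (int kk = 0; kk < k0; ++kk)
            {
                acc += lhs_row[k_i + kk] * rhs_row[k_i + kk];
            }
        }
        for (; k_i < k; ++k_i)           // leftover loop: only exists when k % k0 != 0
        {
            acc += lhs_row[k_i] * rhs_row[k_i];
        }
        return acc;
    }

Second, get_name() encodes the variant and the problem/block sizes, so an nt_t kernel for a 64x64x64 single-batch problem with m0 = 4, n0 = 4, k0 = 4 would be named mat_mul_native_nt_t_64_64_64_1_4_4_4.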
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentMatMul.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentMatMul.cpp
index eada61e1b3..f238d42d98 100644
--- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentMatMul.cpp
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentMatMul.cpp
@@ -91,14 +91,16 @@ Status ClComponentMatMul::validate(const Properties &properties,
const auto rhs = tensors.get_const_tensor(TensorType::ACL_SRC_1);
const auto dst = tensors.get_const_tensor(TensorType::ACL_DST_0);
+ // Currently, the only supported case is when adj_lhs = false and adj_rhs = true
+ ARM_COMPUTE_RETURN_ERROR_ON((attributes.adj_lhs() != false) || (attributes.adj_rhs() != true));
+
// Check if Matching data type
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, rhs);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, dst);
// Data type
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F16, DataType::F32);
- // Data layout
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(lhs, DataLayout::NHWC);
// All tensor infos are initialized
ARM_COMPUTE_RETURN_ERROR_ON(lhs->tensor_shape().total_size() == 0);
@@ -108,20 +110,18 @@ Status ClComponentMatMul::validate(const Properties &properties,
// Device requirements are met
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(lhs);
- // Check if dst shape is correct
+ // Check if block sizes are supported
MatMulKernelInfo matmul_kernel_info =
MatMulKernelInfo(attributes.adj_lhs(), attributes.adj_rhs(), settings.m0(), settings.n0(), settings.k0());
- const auto expected_dst_shape =
- misc::shape_calculator::compute_matmul_shape(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info);
-
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), expected_dst_shape);
-
- // Check if block sizes are supported
ARM_COMPUTE_RETURN_ON_ERROR(validate_matmul_kernel_info(attributes, settings));
-
ARM_COMPUTE_RETURN_ON_ERROR(
opencl::kernels::validate_matmul_input_shapes(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info));
+ // Check if dst shape is correct
+ const auto expected_dst_shape =
+ misc::shape_calculator::compute_matmul_shape(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), expected_dst_shape);
+
return Status{};
}
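
The reordering above makes validate() fail fast on the kernel configuration before deriving anything from it: block sizes and input shapes are checked first, and the expected destination shape is computed only once the configuration is known to be valid (compute_matmul_shape assumes consistent inputs). A self-contained toy sketch of that ordering (hypothetical stand-in types; the real code returns arm_compute::Status and uses the ARM_COMPUTE_RETURN_* macros shown in the diff):

    #include <string>

    struct SimpleStatus { bool ok; std::string error; };

    SimpleStatus validate_matmul_ordered(int m0, int n0, int k0, int lhs_k, int rhs_k,
                                         int lhs_m, int rhs_n, int dst_m, int dst_n)
    {
        // 1) Block sizes first: cheap and independent of the tensor shapes.
        if (m0 < 1 || n0 < 1 || k0 < 1)
            return { false, "unsupported block sizes" };
        // 2) Input shapes next: the contraction dimensions must agree.
        if (lhs_k != rhs_k)
            return { false, "LHS/RHS K mismatch" };
        // 3) Only now derive the expected destination shape and compare.
        if (dst_m != lhs_m || dst_n != rhs_n)
            return { false, "unexpected destination shape" };
        return { true, "" };
    }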
diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuMatMul.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuMatMul.cpp
index ee27b5ea47..e24629a036 100644
--- a/src/dynamic_fusion/sketch/gpu/operators/GpuMatMul.cpp
+++ b/src/dynamic_fusion/sketch/gpu/operators/GpuMatMul.cpp
@@ -87,8 +87,6 @@ Status is_supported_op_helper(const GpuWorkloadContext &context,
// Check support level
// Data type
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F16, DataType::F32);
- // Data layout
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(lhs, DataLayout::NHWC);
// Check components
if (context.gpu_language() == GpuLanguage::OpenCL)
diff --git a/tests/validation/dynamic_fusion/gpu/cl/MatMul.cpp b/tests/validation/dynamic_fusion/gpu/cl/MatMul.cpp
new file mode 100644
index 0000000000..38c3a0ca0e
--- /dev/null
+++ b/tests/validation/dynamic_fusion/gpu/cl/MatMul.cpp
@@ -0,0 +1,350 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef ACL_INTERNAL_TEST_CKW_IN_DF
+#include "tests/AssetsLibrary.h"
+#include "tests/CL/CLAccessor.h"
+#include "tests/framework/Fixture.h"
+#include "tests/framework/Macros.h"
+#include "tests/framework/datasets/Datasets.h"
+#include "tests/datasets/LargeMatMulDataset.h"
+#include "tests/datasets/SmallMatMulDataset.h"
+#include "tests/validation/Validation.h"
+#include "tests/validation/reference/Permute.h"
+#include "tests/validation/reference/GEMM.h"
+
+#include "tests/validation/fixtures/dynamic_fusion/gpu/cl/MatMulKernelFixture.h"
+
+#include <tuple>
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace
+{
+RelativeTolerance<float> tolerance_f32(0.001f); /**< Relative tolerance for comparing the reference's output against the implementation's output for floating point data types */
+constexpr float abs_tolerance_f32(0.0001f); /**< Absolute tolerance for comparing the reference's output against the implementation's output for fp32 data, used when the relative tolerance fails because of small values */
+constexpr float abs_tolerance_f16(0.001f); /**< Absolute tolerance for comparing the reference's output against the implementation's output for fp16 data, used when the relative tolerance fails because of small values */
+RelativeTolerance<half_float::half> tolerance_f16(half(0.02)); /**< Relative tolerance for comparing the reference's output against the implementation's output for fp16 data types */
+} // namespace
+
+/** M0 values to test --precommit*/
+const auto m0_values_precommit = framework::dataset::make("M0", { 1, 3 });
+
+/** N0 values to test --precommit*/
+const auto n0_values_precommit = framework::dataset::make("N0", { 1, 2, 4 });
+
+/** K0 values to test --precommit*/
+const auto k0_values_precommit = framework::dataset::make("K0", { 1, 2, 3 });
+
+/** M0 values to test --nightly*/
+const auto m0_values_nightly_lhs_nt = framework::dataset::make("M0", { 1, 2, 3, 4, 5, 6, 7, 8 });
+const auto m0_values_nightly_lhs_t = framework::dataset::make("M0", { 1, 2, 3, 4, 8 });
+
+/** N0 values to test --nightly*/
+const auto n0_values_nightly_rhs_nt = framework::dataset::make("N0", { 1, 2, 3, 4, 8, 16 });
+const auto n0_values_nightly_rhs_t = framework::dataset::make("N0", { 1, 2, 3, 4, 8 });
+
+/** K0 values to test --nightly*/
+const auto k0_values_nightly_lhs_nt_rhs_nt = framework::dataset::make("K0", { 1, 2, 3, 4, 8, 16 });
+const auto k0_values_nightly_rhs_t = framework::dataset::make("K0", { 1, 2, 3, 4, 8 });
+const auto k0_values_nightly_lhs_t_rhs_nt = framework::dataset::make("K0", { 1, 2, 3, 4, 5, 6, 7, 8 });
+
+TEST_SUITE(CL)
+TEST_SUITE(DYNAMIC_FUSION)
+
+TEST_SUITE(MatMul)
+
+TEST_SUITE(Validate)
+TEST_CASE(SupportedBlockSizes, framework::DatasetMode::ALL)
+{
+ using MatMulConfigurationPair = std::pair<MatMulKernelInfo, bool>;
+
+ const std::vector<MatMulConfigurationPair> supported_block_sizes =
+ {
+ // MatMulKernelInfo(adj_lhs, adj_rhs, M0, N0, K0, export_rhs_to_cl_image = false)
+
+ // Lhs not-transposed, Rhs transposed
+ { MatMulKernelInfo(false, true, 0, 1, 1), false }, // M0 should be > 0
+ { MatMulKernelInfo(false, true, 3, 11, 1), false }, // N0 not in {1, 2, 3, 4, 8, 16}
+ { MatMulKernelInfo(false, true, 3, 7, 1), false }, // N0 not in {1, 2, 3, 4, 8, 16}
+ { MatMulKernelInfo(false, true, 3, 3, 12), false }, // K0 not in {1, 2, 3, 4, 8, 16}
+ { MatMulKernelInfo(false, true, 3, 3, 6), false }, // K0 not in {1, 2, 3, 4, 8, 16}
+ { MatMulKernelInfo(false, true, 5, 1, 2), true },
+ { MatMulKernelInfo(false, true, 3, 3, 3), true },
+ { MatMulKernelInfo(false, true, 2, 4, 8), true },
+
+ };
+
+ // Create a new workload sketch
+ auto cl_compile_ctx = CLKernelLibrary::get().get_compile_context();
+ auto context = GpuWorkloadContext{ &cl_compile_ctx };
+ GpuWorkloadSketch sketch{ &context };
+
+ // Set big enough shapes so that block sizes are not truncated. Also, set all dimensions equal
+ // so that it doesn't fail for different NT/T configurations. We aim to test the block sizes here,
+ // not the shapes themselves.
+ const TensorInfo lhs_info = context.create_tensor_info(TensorInfo(TensorShape(100U, 100U), 1, DataType::F32));
+ const TensorInfo rhs_info = context.create_tensor_info(TensorInfo(TensorShape(100U, 100U), 1, DataType::F32));
+
+ for(auto &pair : supported_block_sizes)
+ {
+ MatMulAttributes matmul_attr {};
+ matmul_attr.adj_lhs(pair.first.adj_lhs);
+ matmul_attr.adj_rhs(pair.first.adj_rhs);
+
+ GpuMatMulSettings matmul_settings {};
+ matmul_settings.m0(pair.first.m0);
+ matmul_settings.n0(pair.first.n0);
+ matmul_settings.k0(pair.first.k0);
+
+ Status status = GpuMatMul::validate_op(sketch, &lhs_info, &rhs_info, matmul_attr, matmul_settings);
+ ARM_COMPUTE_EXPECT(bool(status) == pair.second, framework::LogLevel::ERRORS);
+ }
+}
+
+TEST_CASE(ValidateInputShapes, framework::DatasetMode::ALL)
+{
+ // Create a sketch
+ auto cl_compile_ctx = CLKernelLibrary::get().get_compile_context();
+ auto context = GpuWorkloadContext{ &cl_compile_ctx };
+ GpuWorkloadSketch sketch{ &context };
+
+ // Configurations are assumed to be Nt/Nt, but will be transposed inside the test to test other configurations
+ using ShapeConfigurationTuple = std::tuple<TensorShape, TensorShape, bool>;
+ const std::vector<ShapeConfigurationTuple> shape_configurations =
+ {
+ { TensorShape(5U, 1U), TensorShape(3U, 5U), true },
+ { TensorShape(10U, 12U), TensorShape(3U, 10U), true },
+ { TensorShape(8U, 4U), TensorShape(2U, 8U), true },
+ { TensorShape(8U, 4U), TensorShape(2U, 5U), false }, // Mismatch in the K dimension
+ { TensorShape(5U, 0U), TensorShape(2U, 5U), false }, // Invalid dimension
+ { TensorShape(5U, 4U, 3U, 4U, 5U, 6U), TensorShape(2U, 5U, 3U, 4U, 5U, 6U), true },
+ { TensorShape(5U, 4U, 3U, 4U, 5U, 1U), TensorShape(2U, 5U, 3U, 4U, 5U, 6U), false }, // no batch broadcasting
+ { TensorShape(5U, 4U, 3U, 4U, 9U, 6U), TensorShape(2U, 5U, 3U, 4U, 5U, 6U), false }, // mismatch in batch dimension
+ };
+
+ for(auto &tuple : shape_configurations)
+ {
+ const bool expected = std::get<2>(tuple);
+
+ for(bool adj_lhs : { false })
+ {
+ for(bool adj_rhs : { true })
+ {
+ TensorShape lhs_shape = std::get<0>(tuple);
+ TensorShape rhs_shape = std::get<1>(tuple);
+
+ if(adj_lhs)
+ {
+ permute(lhs_shape, PermutationVector(1U, 0U));
+ }
+
+ if(adj_rhs)
+ {
+ permute(rhs_shape, PermutationVector(1U, 0U));
+ }
+
+ const TensorInfo lhs_info = context.create_tensor_info(TensorInfo(lhs_shape, 1, DataType::F32));
+ const TensorInfo rhs_info = context.create_tensor_info(TensorInfo(rhs_shape, 1, DataType::F32));
+
+ MatMulAttributes matmul_attr {};
+ matmul_attr.adj_lhs(adj_lhs);
+ matmul_attr.adj_rhs(adj_rhs);
+
+ GpuMatMulSettings matmul_settings {};
+ matmul_settings.m0(1);
+ matmul_settings.n0(1);
+ matmul_settings.k0(1);
+
+ Status status = GpuMatMul::validate_op(sketch, &lhs_info, &rhs_info, matmul_attr, matmul_settings);
+ ARM_COMPUTE_EXPECT(bool(status) == expected, framework::LogLevel::ERRORS);
+ }
+ }
+ }
+}
+
+
+TEST_CASE(ValidateDataTypes, framework::DatasetMode::ALL)
+{
+ // Configurations are assumed to be Nt/Nt, but will be transposed inside the test to test other configurations
+ using DataTypeConfigurationTuple = std::tuple<DataType, DataType, DataType, bool>;
+ const std::vector<DataTypeConfigurationTuple> data_type_configurations =
+ {
+ { DataType::F32, DataType::F32, DataType::F32, true },
+ { DataType::F16, DataType::F16, DataType::F16, true },
+ { DataType::F16, DataType::F32, DataType::F32, false }, // no mixed precision
+ { DataType::F64, DataType::F64, DataType::F64, false }, // no double precision
+ { DataType::QASYMM8, DataType::QASYMM8, DataType::QASYMM8, false }, // no quantized types
+ { DataType::QASYMM8_SIGNED, DataType::QASYMM8_SIGNED, DataType::QASYMM8_SIGNED, false }, // no quantized types
+ { DataType::QSYMM8_PER_CHANNEL, DataType::QSYMM8_PER_CHANNEL, DataType::QSYMM8_PER_CHANNEL, false }, // no quantized types
+ { DataType::QASYMM16, DataType::QASYMM16, DataType::QASYMM16, false }, // no quantized types
+ { DataType::QSYMM16, DataType::QSYMM16, DataType::QSYMM16, false }, // no quantized types
+ { DataType::QSYMM8, DataType::QSYMM8, DataType::QSYMM8, false }, // no quantized types
+ { DataType::S64, DataType::S64, DataType::S64, false }, // no integral types
+ { DataType::S32, DataType::S32, DataType::S32, false }, // no integral types
+ { DataType::S16, DataType::S16, DataType::S16, false }, // no integral types
+ { DataType::S8, DataType::S8, DataType::S8, false }, // no integral types
+ { DataType::U64, DataType::U64, DataType::U64, false }, // no integral types
+ { DataType::U32, DataType::U32, DataType::U32, false }, // no integral types
+ { DataType::U16, DataType::U16, DataType::U16, false }, // no integral types
+ { DataType::U8, DataType::U8, DataType::U8, false }, // no integral types
+ };
+ // Create a sketch
+ auto cl_compile_ctx = CLKernelLibrary::get().get_compile_context();
+ auto context = GpuWorkloadContext{ &cl_compile_ctx };
+ GpuWorkloadSketch sketch{ &context };
+
+ const TensorShape shape = TensorShape(10U, 10U);
+ MatMulAttributes matmul_attr {};
+ matmul_attr.adj_lhs(false);
+ matmul_attr.adj_rhs(false);
+ GpuMatMulSettings matmul_settings {};
+ matmul_settings.m0(1);
+ matmul_settings.n0(1);
+ matmul_settings.k0(1);
+
+ for(auto &tuple : data_type_configurations)
+ {
+ const bool expected = std::get<3>(tuple);
+
+ const TensorInfo lhs_info = context.create_tensor_info(TensorInfo(shape, 1, std::get<0>(tuple)));
+ const TensorInfo rhs_info = context.create_tensor_info(TensorInfo(shape, 1, std::get<1>(tuple)));
+
+ Status status = GpuMatMul::validate_op(sketch, &lhs_info, &rhs_info, matmul_attr, matmul_settings);
+ ARM_COMPUTE_EXPECT(bool(status) == expected, framework::LogLevel::ERRORS);
+ }
+}
+
+TEST_SUITE_END() // Validate
+
+template <typename T>
+using DynamicFusionGpuMatmulFixture = DynamicFusionGpuMatMulValidationFixture<CLTensor, CLAccessor, GpuMatMul, T>;
+
+TEST_SUITE(Float)
+TEST_SUITE(FP32)
+
+FIXTURE_DATA_TEST_CASE(RunTiny, DynamicFusionGpuMatmulFixture<float>, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(combine(combine(datasets::TinyMatMulDataset(),
+ framework::dataset::make("TransposeA", { false })),
+ framework::dataset::make("TransposeB", { true })),
+ m0_values_precommit),
+ n0_values_precommit),
+ k0_values_precommit),
+ framework::dataset::make("ExportRhsToCLImage", { false })),
+ framework::dataset::make("DataType", DataType::F32)))
+{
+ // Validate output
+ validate(CLAccessor(_target), _reference, tolerance_f32, 0.f, abs_tolerance_f32);
+}
+
+FIXTURE_DATA_TEST_CASE(RunSmall, DynamicFusionGpuMatmulFixture<float>, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(combine(combine(datasets::SmallMatMulDataset(),
+ framework::dataset::make("TransposeA", { false })),
+ framework::dataset::make("TransposeB", { true })),
+ m0_values_precommit),
+ n0_values_precommit),
+ k0_values_precommit),
+ framework::dataset::make("ExportRhsToCLImage", { false })),
+ framework::dataset::make("DataType", DataType::F32)))
+{
+ // Validate output
+ validate(CLAccessor(_target), _reference, tolerance_f32, 0.f, abs_tolerance_f32);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLargeRhsTransposed, DynamicFusionGpuMatmulFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(combine(combine(datasets::LargeMatMulDataset(),
+ framework::dataset::make("TransposeA", { false })),
+ framework::dataset::make("TransposeB", { true })),
+ m0_values_nightly_lhs_nt),
+ n0_values_nightly_rhs_t),
+ k0_values_nightly_rhs_t),
+ framework::dataset::make("ExportRhsToCLImage", { false })),
+ framework::dataset::make("DataType", DataType::F32)))
+{
+ // Validate output
+ validate(CLAccessor(_target), _reference, tolerance_f32, 0.f, abs_tolerance_f32);
+}
+
+// Running High Dimensional test is enough for FP32, because we're stressing the number of dimensions, not data type or M0/N0/K0
+FIXTURE_DATA_TEST_CASE(RunHighDimensional, DynamicFusionGpuMatmulFixture<float>, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(combine(combine(datasets::HighDimensionalMatMulDataset(),
+ framework::dataset::make("TransposeA", { false })),
+ framework::dataset::make("TransposeB", { true })),
+ framework::dataset::make("M0", { 2 })),
+ framework::dataset::make("N0", { 2 })),
+ framework::dataset::make("K0", { 2 })),
+ framework::dataset::make("ExportRhsToCLImage", { false })),
+ framework::dataset::make("DataType", DataType::F32)))
+{
+ // Validate output
+ validate(CLAccessor(_target), _reference, tolerance_f32, 0.f, abs_tolerance_f32);
+}
+TEST_SUITE_END() // FP32
+
+TEST_SUITE(FP16)
+
+FIXTURE_DATA_TEST_CASE(RunSmall, DynamicFusionGpuMatmulFixture<half>, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(combine(combine(datasets::SmallMatMulDataset(),
+ framework::dataset::make("TransposeA", { false })),
+ framework::dataset::make("TransposeB", { true })),
+ m0_values_precommit),
+ n0_values_precommit),
+ k0_values_precommit),
+ framework::dataset::make("ExportRhsToCLImage", { false })),
+ framework::dataset::make("DataType", DataType::F16)))
+{
+ // Validate output
+ validate(CLAccessor(_target), _reference, tolerance_f16, 0.f, abs_tolerance_f16);
+}
+
+
+FIXTURE_DATA_TEST_CASE(RunLargeRhsTransposed, DynamicFusionGpuMatmulFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(combine(combine(datasets::LargeMatMulDataset(),
+ framework::dataset::make("TransposeA", { false })),
+ framework::dataset::make("TransposeB", { true })),
+ m0_values_nightly_lhs_nt),
+ n0_values_nightly_rhs_t),
+ k0_values_nightly_rhs_t),
+ framework::dataset::make("ExportRhsToCLImage", { false })),
+ framework::dataset::make("DataType", DataType::F16)))
+{
+ // Validate output
+ validate(CLAccessor(_target), _reference, tolerance_f16, 0.f, abs_tolerance_f16);
+}
+
+TEST_SUITE_END() // FP16
+
+TEST_SUITE_END() // Float
+TEST_SUITE_END() // MatMul
+TEST_SUITE_END() // DYNAMIC_FUSION
+TEST_SUITE_END() // CL
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
+#endif // ACL_INTERNAL_TEST_CKW_IN_DF
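
A note on the tolerances declared at the top of this test file: validate(...) receives both a relative tolerance and an absolute floor, because a purely relative bound degenerates for outputs near zero. Conceptually the acceptance test behaves like this hedged sketch (illustrative only, not the framework's actual implementation):

    #include <cmath>

    // Accept if the error is within the absolute floor OR within the relative bound.
    bool within_tolerance(float target, float reference, float rel_tol, float abs_tol)
    {
        const float diff = std::fabs(target - reference);
        return diff <= abs_tol || diff <= rel_tol * std::fabs(reference);
    }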
diff --git a/tests/validation/fixtures/dynamic_fusion/gpu/cl/MatMulKernelFixture.h b/tests/validation/fixtures/dynamic_fusion/gpu/cl/MatMulKernelFixture.h
new file mode 100644
index 0000000000..c6ac4b91db
--- /dev/null
+++ b/tests/validation/fixtures/dynamic_fusion/gpu/cl/MatMulKernelFixture.h
@@ -0,0 +1,267 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_TESTS_VALIDATION_FIXTURES_DYNAMIC_FUSION_GPU_CL_MATMULKERNELFIXTURE_H
+#define ACL_TESTS_VALIDATION_FIXTURES_DYNAMIC_FUSION_GPU_CL_MATMULKERNELFIXTURE_H
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+#include "arm_compute/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.h"
+#include "arm_compute/dynamic_fusion/sketch/attributes/MatMulAttributes.h"
+#include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h"
+#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuMatMul.h"
+#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuOutput.h"
+
+#include "tests/CL/CLAccessor.h"
+#include "tests/framework/Fixture.h"
+#include "tests/framework/Macros.h"
+#include "tests/validation/Helpers.h"
+#include "tests/validation/Validation.h"
+#include "tests/validation/reference/GEMM.h"
+#include "tests/validation/reference/Permute.h"
+#include "tests/validation/reference/ReshapeLayer.h"
+
+using namespace arm_compute::experimental::dynamic_fusion;
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace
+{
+template <typename U>
+void fill(U &&tensor, int i)
+{
+ switch(tensor.data_type())
+ {
+ case DataType::F16:
+ {
+ arm_compute::utils::uniform_real_distribution_16bit<half> distribution{ -1.0f, 1.0f };
+ library->fill(tensor, distribution, i);
+ break;
+ }
+ case DataType::F32:
+ {
+ std::uniform_real_distribution<float> distribution(-1.0f, 1.0f);
+ library->fill(tensor, distribution, i);
+ break;
+ }
+ default:
+ library->fill_tensor_uniform(tensor, i);
+ }
+}
+
+} // namespace
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+class DynamicFusionGpuMatMulValidationGenericFixture : public framework::Fixture
+{
+
+public:
+ void setup(TensorShape lhs_shape, TensorShape rhs_shape, TensorShape output_shape, bool transpose_a, bool transpose_b,
+ int M0, int N0, int K0, bool export_rhs_to_cl_image, DataType data_type)
+ {
+ // For brevity, the input shapes are assumed to be non-transposed for both the A and B matrices.
+ if(transpose_a)
+ {
+ permute(lhs_shape, PermutationVector(1U, 0U));
+ }
+ if(transpose_b)
+ {
+ permute(rhs_shape, PermutationVector(1U, 0U));
+ }
+
+ // Skip configurations unsupported by the device.
+ _device_supports_export_to_cl_image = image2d_from_buffer_supported(CLKernelLibrary::get().get_device());
+ if(!_device_supports_export_to_cl_image && export_rhs_to_cl_image)
+ {
+ ARM_COMPUTE_TEST_INFO("cl_khr_image2d_from_buffer not supported. TEST skipped");
+ framework::ARM_COMPUTE_PRINT_INFO();
+ return; // Note: Also need to skip the validate in corresponding FIXTURE_DATA_TEST_CASEs.
+ }
+
+ _target = compute_target(lhs_shape, rhs_shape, transpose_a, transpose_b, M0, N0, K0, export_rhs_to_cl_image, data_type);
+ _reference = compute_reference(lhs_shape, rhs_shape, output_shape, transpose_a, transpose_b, data_type);
+ }
+
+protected:
+ TensorType compute_target(TensorShape &shape_a, TensorShape &shape_b, bool transpose_a, bool transpose_b, int M0, int N0, int K0, bool export_rhs_to_cl_image, DataType data_type)
+ {
+ ARM_COMPUTE_UNUSED(export_rhs_to_cl_image);
+ CLScheduler::get().default_reinit();
+
+ // Create a new workload sketch
+ auto cl_compile_ctx = CLKernelLibrary::get().get_compile_context();
+ auto context = GpuWorkloadContext{ &cl_compile_ctx };
+ GpuWorkloadSketch sketch{ &context };
+
+ // Create sketch tensors
+ TensorInfo lhs_info = context.create_tensor_info(TensorInfo(shape_a, 1, data_type));
+ TensorInfo rhs_info = context.create_tensor_info(TensorInfo(shape_b, 1, data_type));
+ TensorInfo dst_info = context.create_tensor_info();
+
+ MatMulAttributes matmul_attr {};
+ matmul_attr.adj_lhs(transpose_a);
+ matmul_attr.adj_rhs(transpose_b);
+
+ GpuMatMulSettings matmul_settings {};
+ matmul_settings.m0(M0);
+ matmul_settings.n0(N0);
+ matmul_settings.k0(K0);
+
+ ITensorInfo *ans_info = FunctionType::create_op(sketch, &lhs_info, &rhs_info, matmul_attr, matmul_settings);
+ GpuOutput::create_op(sketch, ans_info, &dst_info);
+
+ // Configure runtime
+ ClWorkloadRuntime runtime;
+ runtime.configure(sketch);
+
+ for(auto &data : runtime.get_auxiliary_tensors())
+ {
+ CLTensor *tensor = std::get<0>(data);
+ TensorInfo info = std::get<1>(data);
+ AuxMemoryInfo aux_mem_req = std::get<2>(data);
+ tensor->allocator()->init(info, aux_mem_req.alignment);
+ tensor->allocator()->allocate(); // Use ACL allocated memory
+ }
+
+ // Construct user tensors
+ TensorType t_lhs{};
+ TensorType t_rhs{};
+ TensorType t_dst{};
+
+ // Initialize user tensors
+ t_lhs.allocator()->init(lhs_info);
+ t_rhs.allocator()->init(rhs_info);
+ t_dst.allocator()->init(dst_info);
+
+ ARM_COMPUTE_ASSERT(t_lhs.info()->is_resizable());
+ ARM_COMPUTE_ASSERT(t_rhs.info()->is_resizable());
+ ARM_COMPUTE_ASSERT(t_dst.info()->is_resizable());
+
+ // Allocate and fill user tensors
+ t_lhs.allocator()->allocate();
+ t_rhs.allocator()->allocate();
+ t_dst.allocator()->allocate();
+
+ ARM_COMPUTE_ASSERT(!t_lhs.info()->is_resizable());
+ ARM_COMPUTE_ASSERT(!t_rhs.info()->is_resizable());
+ ARM_COMPUTE_ASSERT(!t_dst.info()->is_resizable());
+
+ fill(AccessorType(t_lhs), 0);
+ fill(AccessorType(t_rhs), 1);
+
+ // Run runtime
+ runtime.run({ &t_lhs, &t_rhs, &t_dst });
+
+ return t_dst;
+ }
+
+ SimpleTensor<T> compute_reference(const TensorShape &shape_a, const TensorShape &shape_b, const TensorShape &output_shape, bool pretranspose_a, bool pretranspose_b, DataType data_type)
+ {
+ // We collapse dimensions > 3 onto dimension 3, i.e. 5D+ tensors will look like 4D
+ // This is necessary unless we choose to extend gemm reference for 5D+ tensors
+ TensorShape output_shape_collapsed = output_shape.collapsed_from(Window::DimZ);
+ TensorShape shape_a_collapsed = shape_a.collapsed_from(Window::DimZ);
+ TensorShape shape_b_collapsed = shape_b.collapsed_from(Window::DimZ);
+
+ // Create reference
+ SimpleTensor<T> a{ shape_a_collapsed, data_type, 1 };
+ SimpleTensor<T> b{ shape_b_collapsed, data_type, 1 };
+ SimpleTensor<T> c{ output_shape_collapsed, data_type, 1 };
+
+ // Fill reference
+ fill(a, 0);
+ fill(b, 1);
+
+ /* Note: Assuming the usual batch matmul dimensions A = (B x M x K) and B = (B x K x N): if pretranspose_a is set,
+ A is expected as (B x K x M) and must be pre-transposed before being passed to this fixture; it is then transposed
+ again here to restore (B x M x K), which is the layout the reference implementation expects. Similarly, if
+ pretranspose_b is set, B is expected as (B x N x K) and must be pre-transposed before being passed to the fixture. */
+
+ // Define transposed shapes
+ TensorShape a_transposed_shape(a.shape());
+ a_transposed_shape.set(0, a.shape().y());
+ a_transposed_shape.set(1, a.shape().x());
+
+ TensorShape b_transposed_shape(b.shape());
+ b_transposed_shape.set(0, b.shape().y());
+ b_transposed_shape.set(1, b.shape().x());
+
+ // Define transposed tensors
+ SimpleTensor<T> a_transposed{ a_transposed_shape, data_type };
+ SimpleTensor<T> b_transposed{ b_transposed_shape, data_type };
+
+ // Pre-transpose A if necessary
+ if(pretranspose_a)
+ {
+ a_transposed = reference::permute<T>(a, PermutationVector(1U, 0U));
+ }
+
+ // Pre-transpose B if necessary
+ if(pretranspose_b)
+ {
+ b_transposed = reference::permute<T>(b, PermutationVector(1U, 0U));
+ }
+
+ // Use the transposed tensors when the corresponding pretranspose flag is set, otherwise use the original tensors
+ SimpleTensor<T> result = reference::gemm<T>((pretranspose_a) ? a_transposed : a, (pretranspose_b) ? b_transposed : b, c, 1.0f, 0.f);
+
+ // We reshape the gemm output back if the tensor is high dimensional
+ if(output_shape_collapsed != output_shape)
+ {
+ result = reference::reshape_layer(result, output_shape);
+ }
+
+ return result;
+ }
+
+ CLTensor _target{};
+ SimpleTensor<T> _reference{};
+ bool _device_supports_export_to_cl_image{ false };
+ bool _device_supports_mmul{ false };
+};
+
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+class DynamicFusionGpuMatMulValidationFixture : public DynamicFusionGpuMatMulValidationGenericFixture<TensorType, AccessorType, FunctionType, T>
+{
+public:
+ void setup(TensorShape lhs_shape, TensorShape rhs_shape, TensorShape output_shape, bool transpose_a, bool transpose_b,
+ int M0, int N0, int K0, bool export_rhs_to_cl_image, DataType data_type)
+ {
+ ARM_COMPUTE_UNUSED(export_rhs_to_cl_image);
+ DynamicFusionGpuMatMulValidationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(lhs_shape, rhs_shape, output_shape, transpose_a, transpose_b, M0,
+ N0, K0, false /* export_rhs_to_cl_image */, data_type);
+ }
+};
+
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
+#endif // ACL_TESTS_VALIDATION_FIXTURES_DYNAMIC_FUSION_GPU_CL_MATMULKERNELFIXTURE_H
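
For clarity on the dimension handling in compute_reference above: shapes with more than three dimensions are collapsed from Window::DimZ upward into a single batch dimension before calling the 3D GEMM reference, and the result is reshaped back afterwards. An illustrative model of that collapse (not the actual TensorShape::collapsed_from implementation):

    #include <array>
    #include <cstddef>
    #include <vector>

    // Folds every dimension from index 2 upward into one batch dimension:
    // (N, M, d2, d3, ...) -> (N, M, d2 * d3 * ...).
    std::array<std::size_t, 3> collapse_from_dim2(const std::vector<std::size_t> &dims)
    {
        std::array<std::size_t, 3> out{ 1, 1, 1 };
        if (!dims.empty())   out[0] = dims[0]; // output width  (N)
        if (dims.size() > 1) out[1] = dims[1]; // output height (M)
        for (std::size_t d = 2; d < dims.size(); ++d)
        {
            out[2] *= dims[d];                 // combined batch dimension
        }
        return out;
    }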