Implement Quantized MatMul kernel using MMUL extension

Resolves: COMPMID-6475
Change-Id: Ic867cdfff5d4391cb749a04bf7cc35cda63d3b71
Signed-off-by: Gunes Bayir <gunes.bayir@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/10311
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Benchmark: Arm Jenkins <bsgcomp@arm.com>
author: Gunes Bayir <gunes.bayir@arm.com> 2023-09-13 11:59:34 +0100
committer: Gunes Bayir <gunes.bayir@arm.com> 2023-09-18 13:51:15 +0000
commit: a116cd3676796412cd4d9318a6cc1c1eef4c093c (patch)
tree: 21788d6776e7a0808d0f6d6c1bef452cfb2c7f27 /tests
parent: 40a9d3ea62d7dfed3fb42b5bc5c2ee5272fd89bf (diff)
download: ComputeLibrary-a116cd3676796412cd4d9318a6cc1c1eef4c093c.tar.gz
2 files changed, 261 insertions, 24 deletions
diff --git a/tests/datasets/MatMulLowpMMULDataset.h b/tests/datasets/MatMulLowpMMULDataset.h
new file mode 100644
index 0000000000..1b22e1061f
--- /dev/null
+++ b/tests/datasets/MatMulLowpMMULDataset.h
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef ACL_TESTS_DATASETS_MATMULLOWPMMULDATASET_H
+#define ACL_TESTS_DATASETS_MATMULLOWPMMULDATASET_H
+
+#include "arm_compute/core/TensorShape.h"
+#include "arm_compute/core/Types.h"
+#include "tests/datasets/MatMulDataset.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace datasets
+{
+/** MatMulLowp MMUL shapes are similar to MatMul MMUL shapes, except that K has to be a
+ * multiple of MMUL_K0, which is 16 (see src/gpu/cl/kernels/ClMatMulLowpNativeMMULKernel.cpp for the definition)
+ */
+class SmallMatMulLowpMMULDataset final : public MatMulDataset
+{
+public:
+    SmallMatMulLowpMMULDataset()
+    {
+        add_config(TensorShape(16U, 4U), TensorShape(4U, 16U), TensorShape(4U, 4U)); // same as mmul block
+        add_config(TensorShape(96U, 1U), TensorShape(1U, 96U), TensorShape(1U, 1U)); // vector x vector
+        add_config(TensorShape(32U, 4U, 2U), TensorShape(16U, 32U, 2U), TensorShape(16U, 4U, 2U));
+        add_config(TensorShape(48U, 2U), TensorShape(17U, 48U), TensorShape(17U, 2U));
+        add_config(TensorShape(32U, 6U), TensorShape(7U, 32U), TensorShape(7U, 6U));
+    }
+};
+
+// This dataset is for a smaller number of tests that still use small shapes,
+// e.g. to avoid repeating everything for QASYMM8 when we're already testing QASYMM8_SIGNED
+class SmallMatMulLowpMMULDatasetSubset final : public MatMulDataset
+{
+public:
+    SmallMatMulLowpMMULDatasetSubset()
+    {
+        add_config(TensorShape(32U, 4U, 2U), TensorShape(16U, 32U, 2U), TensorShape(16U, 4U, 2U));
+        add_config(TensorShape(32U, 6U), TensorShape(7U, 32U), TensorShape(7U, 6U));
+    }
+};
+
+class SmallMatMulLowpMMULWithBiasDataset final : public MatMulDataset
+{
+public:
+    SmallMatMulLowpMMULWithBiasDataset()
+    {
+        add_config(TensorShape(32U, 4U, 2U, 2U), TensorShape(16U, 32U, 2U, 2U), TensorShape(16U, 4U, 2U, 2U));
+    }
+};
+
+class LargeMatMulLowpMMULDataset final : public MatMulDataset
+{
+public:
+    LargeMatMulLowpMMULDataset()
+    {
+        add_config(TensorShape(192U, 38U, 3U, 2U), TensorShape(21U, 192U, 3U, 2U), TensorShape(21U, 38U, 3U, 2U));
+    }
+};
+
+class HighDimensionalMatMulLowpMMULDataset final : public MatMulDataset
+{
+public:
+    HighDimensionalMatMulLowpMMULDataset()
+    {
+        add_config(TensorShape(16U, 5U, 2U, 2U, 2U, 2U), TensorShape(5U, 16U, 2U, 2U, 2U, 2U), TensorShape(5U, 5U, 2U, 2U, 2U, 2U)); // 6D tensor
+    }
+};
+
+} // namespace datasets
+} // namespace test
+} // namespace arm_compute
+
+#endif // ACL_TESTS_DATASETS_MATMULLOWPMMULDATASET_H
diff --git a/tests/validation/CL/MatMulLowpNativeMMULKernel.cpp b/tests/validation/CL/MatMulLowpNativeMMULKernel.cpp
index 10d893e5c4..a361a5af16 100644
--- a/tests/validation/CL/MatMulLowpNativeMMULKernel.cpp
+++ b/tests/validation/CL/MatMulLowpNativeMMULKernel.cpp
@@ -26,8 +26,7 @@
 
 #include "src/gpu/cl/kernels/ClMatMulLowpNativeMMULKernel.h"
 
-#include "tests/datasets/LargeMatMulDataset.h"
-#include "tests/datasets/SmallMatMulDataset.h"
+#include "tests/datasets/MatMulLowpMMULDataset.h"
 #include "tests/framework/Macros.h"
 #include "tests/framework/datasets/Datasets.h"
 #include "tests/validation/Validation.h"
@@ -44,14 +43,27 @@ namespace validation
 {
 namespace
 {
-// TODO: enable
-// constexpr AbsoluteTolerance<float> tolerance_quant(1); /**< Tolerance value for comparing reference's output against implementation's output for quantized data types */
+constexpr AbsoluteTolerance<float> tolerance_quant(1); /**< Tolerance value for comparing reference's output against implementation's output for quantized data types */
 }
+using framework::dataset::make;
+
 template <typename T>
-using CLMatMulLowpNativeMMULKernelFixture = MatMulKernelValidationFixture<T, ClMatMulLowpNativeMMULKernel>;
+using CLMatMulLowpNativeMMULKernelFixture = MatMulKernelValidationFixture<T, ClMatMulLowpNativeMMULKernel, true /* use_mmul */>;
 
 template <typename T>
-using CLMatMulLowpKernelWithBiasFixture = MatMulKernelWithBiasValidation<T, ClMatMulLowpNativeMMULKernel>;
+using CLMatMulLowpNativeMMULKernelWithBiasFixture = MatMulKernelWithBiasValidation<T, ClMatMulLowpNativeMMULKernel, true /* use_mmul */>;
+
+/** M0 values to test --precommit*/
+const auto m0_values_precommit = framework::dataset::make("M0", { 1, 3 });
+
+/** N0 values to test --precommit*/
+const auto n0_values_precommit = framework::dataset::make("N0", { 2, 4 });
+
+/** M0 values to test --nightly*/
+const auto m0_values_nightly_lhs_nt = framework::dataset::make("M0", { 2, 4, 5, 8 });
+
+/** N0 values to test --nightly*/
+const auto n0_values_nightly_rhs_nt = framework::dataset::make("N0", { 1, 3, 8, 16 });
 
 TEST_SUITE(CL)
 TEST_SUITE(MatMulLowpNativeMMULKernel)
@@ -77,9 +89,10 @@ TEST_CASE(SupportedKernelConfigurations, framework::DatasetMode::ALL)
     for(auto &pair : supported_block_sizes)
     {
         TensorInfo output_info;
-        Status     status = ClMatMulLowpNativeMMULKernel::validate(&lhs_info, &rhs_info, nullptr, &output_info, pair.first);
+        Status     status   = ClMatMulLowpNativeMMULKernel::validate(&lhs_info, &rhs_info, nullptr, &output_info, pair.first);
+        const bool expected = (pair.second && arm_matrix_multiply_supported(CLKernelLibrary::get().get_device()));
 
-        ARM_COMPUTE_EXPECT(bool(status) == pair.second, framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(bool(status) == expected, framework::LogLevel::ERRORS);
     }
 }
 
@@ -89,21 +102,22 @@ TEST_CASE(ValidateInputShapes, framework::DatasetMode::ALL)
     using ShapeConfigurationTuple = std::tuple<TensorShape, TensorShape, TensorShape, bool>;
     const std::vector<ShapeConfigurationTuple> shape_configurations =
     {
-        { TensorShape(5U, 1U), TensorShape(3U, 5U), TensorShape(3U), true },
-        { TensorShape(10U, 12U), TensorShape(3U, 10U), TensorShape(3U), true },
-        { TensorShape(8U, 4U), TensorShape(2U, 8U), TensorShape(2U), true },
-        { TensorShape(8U, 4U), TensorShape(2U, 5U), TensorShape(2U), false }, // Mismatch in the K dimension
-        { TensorShape(5U, 0U), TensorShape(2U, 5U), TensorShape(2U), false }, // Invalid dimension
-        { TensorShape(5U, 4U, 3U, 4U, 5U, 6U), TensorShape(2U, 5U, 3U, 4U, 5U, 6U), TensorShape(2U), true },
-        { TensorShape(5U, 4U, 3U, 4U, 5U, 1U), TensorShape(2U, 5U, 3U, 4U, 5U, 6U), TensorShape(2U), false }, // no batch broadcasting
-        { TensorShape(5U, 4U, 3U, 4U, 9U, 6U), TensorShape(2U, 5U, 3U, 4U, 5U, 6U), TensorShape(2U), false }, // mismatch in batch dimension
-        { TensorShape(5U, 1U), TensorShape(3U, 5U), TensorShape(1U), false },                                 // invalid broadcast of bias
-        { TensorShape(5U, 1U), TensorShape(3U, 5U), TensorShape(3U, 3U), false },                             // 2d bias is invalid
+        { TensorShape(32U, 1U), TensorShape(3U, 32U), TensorShape(3U), true },
+        { TensorShape(16U, 12U), TensorShape(3U, 16U), TensorShape(3U), true },
+        { TensorShape(64U, 4U), TensorShape(2U, 64U), TensorShape(2U), true },
+        { TensorShape(16U, 4U), TensorShape(2U, 32U), TensorShape(2U), false }, // Mismatch in the K dimension
+        { TensorShape(16U, 0U), TensorShape(2U, 16U), TensorShape(2U), false }, // Invalid dimension
+        { TensorShape(32U, 4U, 3U, 4U, 5U, 6U), TensorShape(2U, 32U, 3U, 4U, 5U, 6U), TensorShape(2U), true },
+        { TensorShape(32U, 4U, 3U, 4U, 5U, 1U), TensorShape(2U, 32U, 3U, 4U, 5U, 6U), TensorShape(2U), false }, // no batch broadcasting
+        { TensorShape(32U, 4U, 3U, 4U, 9U, 6U), TensorShape(2U, 32U, 3U, 4U, 5U, 6U), TensorShape(2U), false }, // mismatch in batch dimension
+        { TensorShape(32U, 1U), TensorShape(3U, 32U), TensorShape(1U), false },                                 // invalid broadcast of bias
+        { TensorShape(32U, 1U), TensorShape(3U, 32U), TensorShape(3U, 3U), false },                             // 2d bias is invalid
+        { TensorShape(12U, 12U), TensorShape(3U, 12U), TensorShape(3U), false },                                // K must be multiple of 16
     };
 
     for(auto &tuple : shape_configurations)
     {
-        const bool expected = std::get<3>(tuple);
+        const bool expected = (std::get<3>(tuple) && arm_matrix_multiply_supported(CLKernelLibrary::get().get_device()));
 
         for(bool adj_lhs :
             {
@@ -134,7 +148,7 @@ TEST_CASE(ValidateInputShapes, framework::DatasetMode::ALL)
                 const TensorInfo bia_info = TensorInfo(bia_shape, 1, DataType::S32);
                 TensorInfo       output_info;
 
-                MatMulKernelInfo matmul_kernel_info{ adj_lhs, adj_rhs, 1, 1, 1, false /* export_rhs_to_cl_image */ };
+                MatMulKernelInfo matmul_kernel_info{ adj_lhs, adj_rhs, 1, 1, 4, false /* export_rhs_to_cl_image */ };
 
                 Status status = ClMatMulLowpNativeMMULKernel::validate(&lhs_info, &rhs_info, &bia_info, &output_info, matmul_kernel_info);
                 ARM_COMPUTE_EXPECT(bool(status) == expected, framework::LogLevel::ERRORS);
@@ -172,10 +186,10 @@ TEST_CASE(ValidateDataTypes, framework::DatasetMode::ALL)
     // It's enough to test a single shape and block size configuration while checking data types
     const TensorShape      shape     = TensorShape(48U, 48U);
     const TensorShape      bia_shape = TensorShape(48U);
-    const MatMulKernelInfo matmul_kernel_info{ false, false, 1, 1, 1, false };
+    const MatMulKernelInfo matmul_kernel_info{ false, false, 1, 1, 4, false };
     for(auto &tuple : data_type_configurations)
     {
-        const bool expected = std::get<4>(tuple);
+        const bool expected = (std::get<4>(tuple) && arm_matrix_multiply_supported(CLKernelLibrary::get().get_device()));
 
         const TensorInfo lhs_info(shape, 1, std::get<0>(tuple));
         const TensorInfo rhs_info(shape, 1, std::get<1>(tuple));
@@ -183,6 +197,7 @@ TEST_CASE(ValidateDataTypes, framework::DatasetMode::ALL)
         TensorInfo       output_info(shape, 1, std::get<3>(tuple));
 
         Status status = ClMatMulLowpNativeMMULKernel::validate(&lhs_info, &rhs_info, &bia_info, &output_info, matmul_kernel_info);
+
         ARM_COMPUTE_EXPECT(bool(status) == expected, framework::LogLevel::ERRORS);
     }
 }
@@ -192,12 +207,137 @@ TEST_SUITE_END() // Validate
 TEST_SUITE(Quantized)
 TEST_SUITE(QASYMM8_SIGNED)
 
-// TODO: tests
+FIXTURE_DATA_TEST_CASE(RunSmall, CLMatMulLowpNativeMMULKernelFixture<int8_t>,
+                       framework::DatasetMode::ALL,
+                       combine(datasets::SmallMatMulLowpMMULDataset(),
+                               make("TransposeA", { false }),
+                               make("TransposeB", { false }),
+                               m0_values_precommit,
+                               n0_values_precommit,
+                               make("K0", { 4 }),
+                               make("ExportRhsToCLImage", { false }),
+                               make("DataType", DataType::QASYMM8_SIGNED)))
+{
+    if(_device_supports_mmul)
+    {
+        // Validate output
+        validate(CLAccessor(_target), _reference, tolerance_quant);
+    }
+}
+
+FIXTURE_DATA_TEST_CASE(RunWithBias, CLMatMulLowpNativeMMULKernelWithBiasFixture<int8_t>,
+                       framework::DatasetMode::ALL,
+                       combine(datasets::SmallMatMulLowpMMULWithBiasDataset(),
+                               make("TransposeA", { false }),
+                               make("TransposeB", { false }),
+                               m0_values_precommit,
+                               n0_values_precommit,
+                               make("K0", { 4 }),
+                               make("ExportRhsToCLImage", { false }),
+                               make("DataType", DataType::QASYMM8_SIGNED)))
+{
+    if(_device_supports_mmul)
+    {
+        // Validate output
+        validate(CLAccessor(_target), _reference, tolerance_quant);
+    }
+}
+
+FIXTURE_DATA_TEST_CASE(RunLargeNoTranspose, CLMatMulLowpNativeMMULKernelFixture<int8_t>,
+                       framework::DatasetMode::NIGHTLY,
+                       combine(datasets::LargeMatMulLowpMMULDataset(),
+                               make("TransposeA", { false }),
+                               make("TransposeB", { false }),
+                               m0_values_nightly_lhs_nt,
+                               n0_values_nightly_rhs_nt,
+                               make("K0", { 4 }),
+                               make("ExportRhsToCLImage", { false }),
+                               make("DataType", DataType::QASYMM8_SIGNED)))
+{
+    if(_device_supports_mmul)
+    {
+        // Validate output
+        validate(CLAccessor(_target), _reference, tolerance_quant);
+    }
+}
+
+// Running the High Dimensional test for qasymm8_signed is enough, because we're stressing the number of dimensions, not the data type or M0/N0/K0
+// It's a good idea to test each Lhs/Rhs T/NT combination because they're different CL kernels
+FIXTURE_DATA_TEST_CASE(RunHighDimensional, CLMatMulLowpNativeMMULKernelFixture<int8_t>,
+                       framework::DatasetMode::ALL,
+                       combine(datasets::HighDimensionalMatMulLowpMMULDataset(),
+                               make("TransposeA", { false }),
+                               make("TransposeB", { false }),
+                               make("M0", { 2 }),
+                               make("N0", { 2 }),
+                               make("K0", { 4 }),
+                               make("ExportRhsToCLImage", { false }),
+                               make("DataType", DataType::QASYMM8_SIGNED)))
+{
+    if(_device_supports_mmul)
+    {
+        // Validate output
+        validate(CLAccessor(_target), _reference, tolerance_quant);
+    }
+}
 
 TEST_SUITE_END() // QASYMM8_SIGNED
+
 TEST_SUITE(QASYMM8)
 
-// TODO: tests
+FIXTURE_DATA_TEST_CASE(RunSmall, CLMatMulLowpNativeMMULKernelFixture<uint8_t>,
+                       framework::DatasetMode::ALL,
+                       combine(datasets::SmallMatMulLowpMMULDatasetSubset(),
+                               make("TransposeA", { false }),
+                               make("TransposeB", { false }),
+                               m0_values_precommit,
+                               n0_values_precommit,
+                               make("K0", { 4 }),
+                               make("ExportRhsToCLImage", { false }),
+                               make("DataType", DataType::QASYMM8)))
+{
+    if(_device_supports_mmul)
+    {
+        // Validate output
+        validate(CLAccessor(_target), _reference, tolerance_quant);
+    }
+}
+
+FIXTURE_DATA_TEST_CASE(RunWithBias, CLMatMulLowpNativeMMULKernelWithBiasFixture<uint8_t>,
+                       framework::DatasetMode::ALL,
+                       combine(datasets::SmallMatMulLowpMMULWithBiasDataset(),
+                               make("TransposeA", { false }),
+                               make("TransposeB", { false }),
+                               m0_values_precommit,
+                               n0_values_precommit,
+                               make("K0", { 4 }),
+                               make("ExportRhsToCLImage", { false }),
+                               make("DataType", DataType::QASYMM8)))
+{
+    if(_device_supports_mmul)
+    {
+        // Validate output
+        validate(CLAccessor(_target), _reference, tolerance_quant);
+    }
+}
+
+FIXTURE_DATA_TEST_CASE(RunLargeNoTranspose, CLMatMulLowpNativeMMULKernelFixture<uint8_t>,
+                       framework::DatasetMode::NIGHTLY,
+                       combine(datasets::LargeMatMulLowpMMULDataset(),
+                               make("TransposeA", { false }),
+                               make("TransposeB", { false }),
+                               m0_values_nightly_lhs_nt,
+                               n0_values_nightly_rhs_nt,
+                               make("K0", { 4 }),
+                               make("ExportRhsToCLImage", { false }),
+                               make("DataType", DataType::QASYMM8)))
+{
+    if(_device_supports_mmul)
+    {
+        // Validate output
+        validate(CLAccessor(_target), _reference, tolerance_quant);
+    }
+}
 
 TEST_SUITE_END() // QASYMM8
 TEST_SUITE_END() // Quantized
author	Gunes Bayir <gunes.bayir@arm.com>	2023-09-13 11:59:34 +0100
committer	Gunes Bayir <gunes.bayir@arm.com>	2023-09-18 13:51:15 +0000
commit	a116cd3676796412cd4d9318a6cc1c1eef4c093c (patch)
tree	21788d6776e7a0808d0f6d6c1bef452cfb2c7f27 /tests
parent	40a9d3ea62d7dfed3fb42b5bc5c2ee5272fd89bf (diff)
download	ComputeLibrary-a116cd3676796412cd4d9318a6cc1c1eef4c093c.tar.gz