Connect CLMatMul function to quantized kernels and resolve NE BatchMatMul int_8 failures

* Adapt the CLMatMul function and ClMatMul operator to use quantized kernels.
* Add function-level tests.

Resolves: COMPMID-5929 and COMPMID-5811
Change-Id: I5348cdcf07b8074c138e04dfef0a73399377accd
Signed-off-by: Jakub Sujak <jakub.sujak@arm.com>
Signed-off-by: Omar Al Khatib <omar.alkhatib@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/9575
Reviewed-by: Mohmun02 <MohammedSuhail.Munshi@arm.com>
Reviewed-by: SiCong Li <sicong.li@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
author: Jakub Sujak <jakub.sujak@arm.com> 2023-04-17 12:08:48 +0100
committer: Omar Al Khatib <omar.alkhatib@arm.com> 2023-05-05 14:48:28 +0000
commit: e9b3ee2badebf91188c1cd0e59d6aaa30ed60985 (patch)
tree: 750c39df7c0113caf6a893bb6af6e9ef1ecc3756
parent: edafe7f5fdc056fddc395c70420fc869dcb7d9fb (diff)
download: ComputeLibrary-e9b3ee2badebf91188c1cd0e59d6aaa30ed60985.tar.gz
11 files changed, 255 insertions, 114 deletions
diff --git a/arm_compute/runtime/CL/functions/CLMatMul.h b/arm_compute/runtime/CL/functions/CLMatMul.h
index 712bac06bf..2af9a4a9a6 100644
--- a/arm_compute/runtime/CL/functions/CLMatMul.h
+++ b/arm_compute/runtime/CL/functions/CLMatMul.h
@@ -21,11 +21,12 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ARM_COMPUTE_RUNTIME_CL_FUNCTIONS_CLMATMUL
-#define ARM_COMPUTE_RUNTIME_CL_FUNCTIONS_CLMATMUL
+#ifndef ACL_ARM_COMPUTE_RUNTIME_CL_FUNCTIONS_CLMATMUL
+#define ACL_ARM_COMPUTE_RUNTIME_CL_FUNCTIONS_CLMATMUL
 
 #include "arm_compute/runtime/IFunction.h"
 #include <memory>
+
 namespace arm_compute
 {
 // Forward declarations for used types instead of including their header, that could minimize compile time
@@ -64,10 +65,12 @@ public:
      * - All
      *
      * Valid data type configurations:
-     * |lhs          |rhs          |output         |
-     * |:------------|:------------|:--------------|
-     * |F32          |F32          |F32            |
-     * |F16          |F16          |F16            |
+     * |lhs            |rhs            |dst            |
+     * |:--------------|:--------------|:--------------|
+     * |F32            |F32            |F32            |
+     * |F16            |F16            |F16            |
+     * |QASYMM8_SIGNED |QASYMM8_SIGNED |QASYMM8_SIGNED |
+     * |QASYMM8        |QASYMM8        |QASYMM8        |
      *
      * @note BatchMatMul: Batched Matrix Multiply - [A * B], Multiplies all slices (slice is an element of a batch) of Tensors A and B
      *                    and stores the result in the dst tensor of the same batch size.
@@ -76,18 +79,18 @@ public:
      * @note All tensors must have the same data type.
      *
      * @param[in]  compile_context The compile context to be used.
-     * @param[in]  lhs             LHS input tensor (Matrix or Vector A). Data types supported: F16/F32
-     * @param[in]  rhs             RHS input tensor (Matrix B). Data type supported: same as @p lhs.
-     * @param[out] output          Output tensor. Data type supported: same as @p lhs.
-     * @param[in]  matmul_info     Attributes for MatMul
+     * @param[in]  lhs             Left-hand side tensor containing the input activations as Matrix A. Data types supported: F16/F32/QASYMM8_SIGNED/QASYMM8.
+     * @param[in]  rhs             Right-hand side tensor containing the input weights as Matrix B. Data types supported: same as @p lhs.
+     * @param[out] dst             Output tensor to store the result of the batched matrix multiplication. Data types supported: same as @p lhs.
+     * @param[in]  matmul_info     Contains MatMul operation information described in @ref MatMulInfo.
      * @param[in]  settings        Class containing flags for function level settings
      */
-    void configure(const CLCompileContext &compile_context, ICLTensor *rhs, ICLTensor *lhs, ICLTensor *output, const MatMulInfo &matmul_info, const GpuMatMulSettings &settings = GpuMatMulSettings{});
+    void configure(const CLCompileContext &compile_context, ICLTensor *rhs, ICLTensor *lhs, ICLTensor *dst, const MatMulInfo &matmul_info, const GpuMatMulSettings &settings = GpuMatMulSettings{});
     /** Initialise the kernel's inputs and output
      *
      * Similar to @ref CLMatMul::configure()
      */
-    void configure(ICLTensor *lhs, ICLTensor *rhs, ICLTensor *output, const MatMulInfo &matmul_info, const GpuMatMulSettings &settings = GpuMatMulSettings{});
+    void configure(ICLTensor *lhs, ICLTensor *rhs, ICLTensor *dst, const MatMulInfo &matmul_info, const GpuMatMulSettings &settings = GpuMatMulSettings{});
     /** Static function to check if given info will lead to a valid configuration of @ref CLMatMul.
      *
      * Similar to @ref CLMatMul::configure()
@@ -104,4 +107,4 @@ private:
 };
 } // namespace arm_compute
 
-#endif /* ARM_COMPUTE_RUNTIME_CL_FUNCTIONS_CLMATMUL */
+#endif /* ACL_ARM_COMPUTE_RUNTIME_CL_FUNCTIONS_CLMATMUL */
diff --git a/arm_compute/runtime/NEON/functions/NEMatMul.h b/arm_compute/runtime/NEON/functions/NEMatMul.h
index 0f3e3adacc..a331c55a98 100644
--- a/arm_compute/runtime/NEON/functions/NEMatMul.h
+++ b/arm_compute/runtime/NEON/functions/NEMatMul.h
@@ -21,8 +21,8 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEMATMUL
-#define ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEMATMUL
+#ifndef ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEMATMUL
+#define ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEMATMUL
 
 #include "arm_compute/runtime/IFunction.h"
 #include <memory>
@@ -80,25 +80,27 @@ public:
      * - Any
      *
      * Valid data type configurations:
-     * |src0           |src1               |dst            |
+     * |lhs            |rhs                |dst            |
      * |:--------------|:------------------|:--------------|
      * |F32            |F32                |F32            |
      * |F16            |F16                |F16            |
+     * |QASYMM8_SIGNED |QASYMM8_SIGNED     |QASYMM8_SIGNED |
+     * |QASYMM8        |QASYMM8            |QASYMM8        |
      *
-     * @param[in]  lhs      Input source tensor.
-     * @param[in]  rhs      Input source tensor.
-     * @param[out] output   Output tensor. Data type supported: same as @p lhs/rhs
-     * @param[in]  info     Class containing flags to transpose lhs/rhs
+     * @param[in]  lhs      Left-hand side tensor. Data types supported: F16/F32/QASYMM8_SIGNED/QASYMM8.
+     * @param[in]  rhs      Right-hand side tensor. Data types supported: same as @p lhs.
+     * @param[out] dst      Output tensor to store the result of the batched matrix multiplication. Data types supported: same as @p lhs / @p rhs.
+     * @param[in]  info     Contains MatMul operation information described in @ref MatMulInfo.
      * @param[in]  settings Class containing flags for function level settings i.e fast math
      */
-    void configure(ITensor *lhs, ITensor *rhs, ITensor *output, const MatMulInfo &info, const CpuMatMulSettings &settings);
+    void configure(ITensor *lhs, ITensor *rhs, ITensor *dst, const MatMulInfo &info, const CpuMatMulSettings &settings);
     /** Static function to check if given info will lead to a valid configuration of @ref NEMatMul
      *
      * Parameters are similar to @ref NEMatMul::configure()
      *
      * @return Status
      */
-    static Status validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *output, const MatMulInfo &info, const CpuMatMulSettings &settings);
+    static Status validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulInfo &info, const CpuMatMulSettings &settings);
 
     // Inherited methods overridden
     void run() override;
@@ -108,4 +110,4 @@ private:
     std::unique_ptr<Impl> _impl;
 };
 }
-#endif /* ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEMATMUL */
+#endif /* ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEMATMUL */
diff --git a/src/cpu/operators/CpuMatMul.cpp b/src/cpu/operators/CpuMatMul.cpp
index 64b5167ad0..87cb6c6b54 100644
--- a/src/cpu/operators/CpuMatMul.cpp
+++ b/src/cpu/operators/CpuMatMul.cpp
@@ -191,6 +191,7 @@ void CpuMatMul::configure(ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst,
     // Fill AsmGemmInfo class object before configuration
     _gemm_info.activation_info = info.fused_activation();
     _gemm_info.fast_mode       = settings.fast_math();
+    _gemm_info.negated_offsets = false;
 
     lhs_to_use = (_adj_lhs) ? _lhs_transposed : lhs_to_use;
     rhs_to_use = (_adj_rhs) ? _rhs_transposed : rhs_to_use;
diff --git a/src/cpu/operators/CpuMatMul.h b/src/cpu/operators/CpuMatMul.h
index ae6345141e..9f5833b24f 100644
--- a/src/cpu/operators/CpuMatMul.h
+++ b/src/cpu/operators/CpuMatMul.h
@@ -21,8 +21,8 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef SRC_CPU_OPERATORS_CPUMATMUL
-#define SRC_CPU_OPERATORS_CPUMATMUL
+#ifndef ACL_SRC_CPU_OPERATORS_CPUMATMUL
+#define ACL_SRC_CPU_OPERATORS_CPUMATMUL
 
 #include "arm_compute/core/TensorInfo.h"
 #include "src/core/common/Macros.h"
@@ -59,9 +59,9 @@ public:
      * Note: Check documentation of @ref NEMatMul for a list of supported datatypes and layouts
      *
      *
-     * @param[in]  lhs      Source tensor info.
-     * @param[in]  rhs      Source tensor info.
-     * @param[out] dst      Destination tensor info. Data types supported: same as @p lhs / @p rhs.
+     * @param[in]  lhs      Left-hand side tensor info.
+     * @param[in]  rhs      Right-hand side tensor info.
+     * @param[out] dst      Output tensor info for the result of the batched matrix multiplication. Data types supported: same as @p lhs / @p rhs.
      * @param[in]  info     Contains MatMul operation information described in @ref MatMulInfo.
      * @param[in]  settings The settings for matmul operation (i.e fast math)
      */
@@ -112,4 +112,4 @@ private:
 }
 }
 
-#endif /* SRC_CPU_OPERATORS_CPUMATMUL */
+#endif /* ACL_SRC_CPU_OPERATORS_CPUMATMUL */
diff --git a/src/gpu/cl/kernels/ClMatMulLowpNativeKernel.h b/src/gpu/cl/kernels/ClMatMulLowpNativeKernel.h
index 13a33fbd62..d70ff30b91 100644
--- a/src/gpu/cl/kernels/ClMatMulLowpNativeKernel.h
+++ b/src/gpu/cl/kernels/ClMatMulLowpNativeKernel.h
@@ -48,17 +48,17 @@ public:
      *                             Dimensions above 2 are collapsed onto dimension 2 and represent the batch.
      * @param[in]  rhs             Input tensor for the RHS matrix. Data type supported: same as @p lhs.
      *                             Dimensions above 2 are collapsed onto dimension 2 and represent the batch.
-     * @param[out] output          Output tensor info. Data type supported: same as @p lhs
+     * @param[out] dst             Output tensor info. Data type supported: same as @p lhs
      * @param[in]  matmul_info     Attributes for Batch MatMul Kernel
      */
-    void configure(const ClCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *output, const MatMulKernelInfo &matmul_info);
+    void configure(const ClCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst, const MatMulKernelInfo &matmul_info);
     /** Static function to check if given info will lead to a valid configuration
      *
      * Similar to @ref ClMatMulLowpNativeKernel::configure()
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *output, const MatMulKernelInfo &matmul_info);
+    static Status validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulKernelInfo &matmul_info);
 
     // Inherited methods overridden:
     void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
diff --git a/src/gpu/cl/kernels/ClMatMulNativeKernel.cpp b/src/gpu/cl/kernels/ClMatMulNativeKernel.cpp
index 47dba22e8f..8f53c1998f 100644
--- a/src/gpu/cl/kernels/ClMatMulNativeKernel.cpp
+++ b/src/gpu/cl/kernels/ClMatMulNativeKernel.cpp
@@ -119,35 +119,36 @@ ClMatMulNativeKernel::ClMatMulNativeKernel()
 {
     _type = CLKernelType::GEMM;
 }
-Status ClMatMulNativeKernel::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *output, const MatMulKernelInfo &matmul_kernel_info)
+
+Status ClMatMulNativeKernel::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info)
 {
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs, dst);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F32, DataType::F16);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, rhs);
     ARM_COMPUTE_RETURN_ON_ERROR(validate_matmul_kernel_info(matmul_kernel_info));
     ARM_COMPUTE_RETURN_ON_ERROR(validate_input_shapes(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info));
     ARM_COMPUTE_RETURN_ON_ERROR(validate_export_to_cl_image(rhs, matmul_kernel_info));
 
-    if(output->total_size() != 0)
+    if(dst->total_size() != 0)
     {
-        const TensorInfo tensor_info_output = output->clone()->set_tensor_shape(misc::shape_calculator::compute_matmul_shape(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info));
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output);
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, output);
+        const TensorInfo tensor_info_dst = dst->clone()->set_tensor_shape(misc::shape_calculator::compute_matmul_shape(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info));
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, dst);
     }
 
     return Status{};
 }
-void ClMatMulNativeKernel::configure(const ClCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *output, const MatMulKernelInfo &matmul_kernel_info)
+void ClMatMulNativeKernel::configure(const ClCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info)
 {
-    ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, output, &compile_context, &matmul_kernel_info);
-    ARM_COMPUTE_LOG_PARAMS(lhs, rhs, output, matmul_kernel_info);
-    ARM_COMPUTE_ERROR_THROW_ON(validate(lhs, rhs, output, matmul_kernel_info));
+    ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst, &compile_context, &matmul_kernel_info);
+    ARM_COMPUTE_LOG_PARAMS(lhs, rhs, dst, matmul_kernel_info);
+    ARM_COMPUTE_ERROR_THROW_ON(validate(lhs, rhs, dst, matmul_kernel_info));
 
-    // output tensor auto initialization if not yet initialized
-    auto_init_if_empty(*output, lhs->clone()->set_tensor_shape(misc::shape_calculator::compute_matmul_shape(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info)));
+    // dst tensor auto initialization if not yet initialized
+    auto_init_if_empty(*dst, lhs->clone()->set_tensor_shape(misc::shape_calculator::compute_matmul_shape(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info)));
 
-    const int  m       = output->dimension(1);
-    const int  n       = output->dimension(0);
+    const int  m       = dst->dimension(1);
+    const int  n       = dst->dimension(0);
     const int  k       = matmul_kernel_info.adj_lhs ? lhs->tensor_shape().y() : lhs->tensor_shape().x();
     const bool adj_lhs = matmul_kernel_info.adj_lhs;
 
@@ -157,7 +158,7 @@ void ClMatMulNativeKernel::configure(const ClCompileContext &compile_context, IT
     _export_rhs_to_cl_image = matmul_kernel_info.export_rhs_to_cl_image && !rhs->lock_paddings();
 
     // Configure kernel window
-    Window win = calculate_max_window(*output, Steps(n0, m0));
+    Window win = calculate_max_window(*dst, Steps(n0, m0));
     win        = win.collapse(win, Window::DimZ);
     IClKernel::configure_internal(win);
 
@@ -201,7 +202,7 @@ void ClMatMulNativeKernel::configure(const ClCompileContext &compile_context, IT
     _config_id += "_";
     _config_id += support::cpp11::to_string(k);
     _config_id += "_";
-    _config_id += support::cpp11::to_string(output->dimension(2));
+    _config_id += support::cpp11::to_string(dst->dimension(2));
     _config_id += "_";
     _config_id += support::cpp11::to_string(_export_rhs_to_cl_image);
     _config_id += "_";
@@ -219,9 +220,9 @@ void ClMatMulNativeKernel::run_op(ITensorPack &tensors, const Window &window, cl
 
     const ICLTensor *lhs    = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
     const ICLTensor *rhs    = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
-    ICLTensor       *output = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
-    ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, output);
-    ARM_COMPUTE_LOG_PARAMS(lhs, rhs, output);
+    ICLTensor       *dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+    ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst);
+    ARM_COMPUTE_LOG_PARAMS(lhs, rhs, dst);
 
     unsigned int idx              = 0;
     Window       window_collapsed = window.collapse(ICLKernel::window(), Window::DimZ);
@@ -242,7 +243,7 @@ void ClMatMulNativeKernel::run_op(ITensorPack &tensors, const Window &window, cl
     }
 
     add_3d_tensor_nhw_argument(idx, rhs);
-    add_3d_tensor_nhw_argument(idx, output);
+    add_3d_tensor_nhw_argument(idx, dst);
 
     enqueue(queue, *this, window_collapsed, lws_hint());
 }
diff --git a/src/gpu/cl/kernels/ClMatMulNativeKernel.h b/src/gpu/cl/kernels/ClMatMulNativeKernel.h
index 50aa3b70e4..f706256e31 100644
--- a/src/gpu/cl/kernels/ClMatMulNativeKernel.h
+++ b/src/gpu/cl/kernels/ClMatMulNativeKernel.h
@@ -47,17 +47,17 @@ public:
      *                             Dimensions above 2 are collapsed onto dimension 2 and represent the batch.
      * @param[in]  rhs             Input tensor for the RHS matrix. Data type supported: same as @p lhs.
      *                             Dimensions above 2 are collapsed onto dimension 2 and represent the batch.
-     * @param[out] output          Output tensor info. Data type supported: same as @p lhs
+     * @param[out] dst             Output tensor info. Data type supported: same as @p lhs
      * @param[in]  matmul_info     Attributes for Batch MatMul Kernel
      */
-    void configure(const ClCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *output, const MatMulKernelInfo &matmul_info);
+    void configure(const ClCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst, const MatMulKernelInfo &matmul_info);
     /** Static function to check if given info will lead to a valid configuration
      *
      * Similar to @ref ClMatMulNativeKernel::configure()
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *output, const MatMulKernelInfo &matmul_info);
+    static Status validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulKernelInfo &matmul_info);
 
     // Inherited methods overridden:
     void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
diff --git a/src/gpu/cl/operators/ClMatMul.cpp b/src/gpu/cl/operators/ClMatMul.cpp
index 15833216bb..3822c16aa1 100644
--- a/src/gpu/cl/operators/ClMatMul.cpp
+++ b/src/gpu/cl/operators/ClMatMul.cpp
@@ -22,8 +22,11 @@
  * SOFTWARE.
  */
 #include "src/gpu/cl/operators/ClMatMul.h"
+
 #include "arm_compute/core/Error.h"
+#include "arm_compute/core/Utils.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
+
 #include "src/common/utils/Log.h"
 #include "src/gpu/cl/kernels/ClMatMulNativeKernel.h"
 #include "src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.h"
@@ -37,45 +40,74 @@ namespace arm_compute
 namespace opencl
 {
 using namespace arm_compute::opencl::kernels;
+
 ClMatMul::ClMatMul()
-    : _native_matmul_kernel(std::make_unique<ClMatMulNativeKernel>())
+    : _matmul_native_kernel(std::make_unique<ClMatMulNativeKernel>()),
+      _matmul_lowp_native_kernel(std::make_unique<ClMatMulLowpNativeKernel>())
 {
 }
-ClMatMul::~ClMatMul()
-{
-}
-Status ClMatMul::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *output, const MatMulInfo &matmul_info)
+
+Status ClMatMul::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulInfo &matmul_info)
 {
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs, dst);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(rhs, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
+
     const GPUTarget gpu_target = CLScheduler::get().target();
 
     std::unique_ptr<IClMatMulNativeKernelConfig> t = ClMatMulNativeKernelConfigurationFactory::create(gpu_target);
 
     MatMulKernelInfo kernel_info = t->configure(lhs, rhs, matmul_info);
 
-    return ClMatMulNativeKernel::validate(lhs, rhs, output, kernel_info);
+    bool is_quantized = is_data_type_quantized_asymmetric(lhs->data_type());
+
+    return is_quantized ? ClMatMulLowpNativeKernel::validate(lhs, rhs, dst, kernel_info) :
+                          ClMatMulNativeKernel::validate(lhs, rhs, dst, kernel_info);
 }
-void ClMatMul::configure(const CLCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *output, const MatMulInfo &matmul_info)
+
+void ClMatMul::configure(const CLCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst, const MatMulInfo &matmul_info)
 {
-    ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, output);
-    ARM_COMPUTE_LOG_PARAMS(lhs, rhs, output, matmul_info);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst);
+    ARM_COMPUTE_LOG_PARAMS(lhs, rhs, dst, matmul_info);
 
     // Perform validation step
-    ARM_COMPUTE_ERROR_THROW_ON(validate(lhs, rhs, output, matmul_info));
+    ARM_COMPUTE_ERROR_THROW_ON(validate(lhs, rhs, dst, matmul_info));
+
+    _is_quantized = is_data_type_quantized_asymmetric(lhs->data_type());
+
     const GPUTarget gpu_target = CLScheduler::get().target();
 
     std::unique_ptr<IClMatMulNativeKernelConfig> t = ClMatMulNativeKernelConfigurationFactory::create(gpu_target);
 
     MatMulKernelInfo kernel_info = t->configure(lhs, rhs, matmul_info);
 
-    // Set the target for the kernels
-    _native_matmul_kernel->set_target(gpu_target);
+    if(_is_quantized)
+    {
+        _matmul_lowp_native_kernel->set_target(gpu_target);
 
-    // Configure the native matrix multiply kernel
-    _native_matmul_kernel->configure(compile_context, lhs, rhs, output, kernel_info);
+        // Configure the low-precision native matrix multiply kernel
+        _matmul_lowp_native_kernel->configure(compile_context, lhs, rhs, dst, kernel_info);
+    }
+    else
+    {
+        _matmul_native_kernel->set_target(gpu_target);
+
+        // Configure the native matrix multiply kernel
+        _matmul_native_kernel->configure(compile_context, lhs, rhs, dst, kernel_info);
+    }
 }
+
 void ClMatMul::run(ITensorPack &tensors)
 {
-    CLScheduler::get().enqueue_op(*_native_matmul_kernel, tensors, true);
+    if(_is_quantized)
+    {
+        CLScheduler::get().enqueue_op(*_matmul_lowp_native_kernel, tensors, true);
+    }
+    else
+    {
+        CLScheduler::get().enqueue_op(*_matmul_native_kernel, tensors, true);
+    }
 }
+
 } // namespace opencl
 } // namespace arm_compute
diff --git a/src/gpu/cl/operators/ClMatMul.h b/src/gpu/cl/operators/ClMatMul.h
index 20beda91ce..3d9863266e 100644
--- a/src/gpu/cl/operators/ClMatMul.h
+++ b/src/gpu/cl/operators/ClMatMul.h
@@ -21,11 +21,13 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ARM_COMPUTE_SRC_GPU_CL_OPERATORS_ClMatMul
-#define ARM_COMPUTE_SRC_GPU_CL_OPERATORS_ClMatMul
+#ifndef ACL_ARM_COMPUTE_SRC_GPU_CL_OPERATORS_CLMATMUL
+#define ACL_ARM_COMPUTE_SRC_GPU_CL_OPERATORS_CLMATMUL
 
 #include "src/gpu/cl/IClOperator.h"
 #include "src/gpu/cl/kernels/ClMatMulNativeKernel.h"
+#include "src/gpu/cl/kernels/ClMatMulLowpNativeKernel.h"
+
 #include <memory>
 
 namespace arm_compute
@@ -41,17 +43,20 @@ class ClMatMul : public IClOperator
 public:
     /** Constructor */
     ClMatMul();
-    ~ClMatMul();
+    /** Default destructor */
+    ~ClMatMul() = default;
     /** Initialise the kernel's inputs and output
      *
      * Valid data layouts:
      * - All
      *
      * Valid data type configurations:
-     * |lhs          |rhs          |output       |
-     * |:------------|:------------|:------------|
-     * |F32          |F32          |F32          |
-     * |F16          |F16          |F16          |
+     * |lhs            |rhs            |dst            |
+     * |:--------------|:--------------|:--------------|
+     * |F32            |F32            |F32            |
+     * |F16            |F16            |F16            |
+     * |QASYMM8_SIGNED |QASYMM8_SIGNED |QASYMM8_SIGNED |
+     * |QASYMM8        |QASYMM8        |QASYMM8        |
      *
      * @note BatchMatMul: Batched Matrix Multiply - [A * B], Multiplies all slices (slice is an element of a batch) of Tensors A and B
      *                    and stores the result in the dst tensor of the same batch size.
@@ -60,25 +65,28 @@ public:
      * @note All tensors must have the same data type.
      *
      * @param[in]  compile_context The compile context to be used.
-     * @param[in]  lhs             LHS input tensor info (Matrix A). Data types supported: F16/F32
-     * @param[in]  rhs             RHS input tensor info (Matrix B). Data types supported: same as @p lhs.
-     * @param[out] output          Output tensor info. Data types supported: same as @p lhs
-     * @param[in]  matmul_info     Attributes for MatMul
+     * @param[in]  lhs             Left-hand side tensor info. Data types supported: F16/F32/QASYMM8_SIGNED/QASYMM8.
+     * @param[in]  rhs             Right-hand side tensor info. Data types supported: same as @p lhs.
+     * @param[out] dst             Output tensor info for the result of the batched matrix multiplication. Data types supported: same as @p lhs.
+     * @param[in]  matmul_info     Contains MatMul operation information described in @ref MatMulInfo.
      */
-    void configure(const CLCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *output, const MatMulInfo &matmul_info);
+    void configure(const CLCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst, const MatMulInfo &matmul_info);
     /** Static function to check if given info will lead to a valid configuration
      *
      * Similar to @ref ClMatMul::configure()
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *output, const MatMulInfo &matmul_info);
+    static Status validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulInfo &matmul_info);
     // Inherited methods overridden:
     void run(ITensorPack &tensors) override;
 
 private:
-    std::unique_ptr<kernels::ClMatMulNativeKernel> _native_matmul_kernel;
+    std::unique_ptr<kernels::ClMatMulNativeKernel>     _matmul_native_kernel{nullptr};
+    std::unique_ptr<kernels::ClMatMulLowpNativeKernel> _matmul_lowp_native_kernel{nullptr};
+
+    bool _is_quantized{ false };
 };
 } // namespace opencl
 } // namespace arm_compute
-#endif // ARM_COMPUTE_SRC_GPU_CL_OPERATORS_ClMatMul
+#endif /* ACL_ARM_COMPUTE_SRC_GPU_CL_OPERATORS_CLMATMUL */
diff --git a/tests/validation/CL/MatMul.cpp b/tests/validation/CL/MatMul.cpp
index 7c1d16008f..6364b16200 100644
--- a/tests/validation/CL/MatMul.cpp
+++ b/tests/validation/CL/MatMul.cpp
@@ -21,14 +21,19 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
+#include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/CL/CLTensor.h"
 #include "arm_compute/runtime/CL/functions/CLMatMul.h"
+
 #include "tests/CL/CLAccessor.h"
-#include "tests/datasets/LargeMatMulDataset.h"
-#include "tests/datasets/SmallMatMulDataset.h"
+#include "tests/framework/DatasetModes.h"
 #include "tests/framework/Macros.h"
+#include "tests/framework/TestCase.h"
 #include "tests/framework/datasets/Datasets.h"
 #include "tests/validation/Validation.h"
+
+#include "tests/datasets/LargeMatMulDataset.h"
+#include "tests/datasets/SmallMatMulDataset.h"
 #include "tests/validation/fixtures/MatMulFixture.h"
 
 namespace arm_compute
@@ -39,55 +44,143 @@ namespace validation
 {
 namespace
 {
-RelativeTolerance<float> tolerance_f32(0.001f); /**< Tolerance value for comparing reference's output against implementation's output for fp32 data type */
-constexpr float          abs_tolerance_f32(
-    0.0001f); /**< Absolute tolerance value for comparing reference's output against implementation's output for fp32 data type in case using relative tolerance fails because of small values */
-constexpr float abs_tolerance_f16(
-    0.001f);                                                   /**< Absolute tolerance value for comparing reference's output against implementation's output for fp16  data type in case using relative tolerance fails because of small values */
-RelativeTolerance<half_float::half> tolerance_f16(half(0.01)); /**< Tolerance value for comparing reference's output against implementation's output for fp16 data type */
+RelativeTolerance<float>             tolerance_f32(0.001f);      /**< Tolerance value for comparing reference's output against implementation's output for fp32 data type */
+constexpr float                      abs_tolerance_f32(0.0001f); /**< Absolute tolerance value for comparing reference's output against implementation's output for fp32 data type in case using relative tolerance fails because of small values */
+constexpr float                      abs_tolerance_f16(0.001f);  /**< Absolute tolerance value for comparing reference's output against implementation's output for fp16 data type in case using relative tolerance fails because of small values */
+RelativeTolerance<half_float::half>  tolerance_f16(half(0.01));  /**< Tolerance value for comparing reference's output against implementation's output for fp16 data type */
+constexpr AbsoluteTolerance<uint8_t> tolerance_quant(1);         /**< Tolerance value for comparing reference's output against implementation's output for quantized data types */
 } // namespace
 
 template <typename T>
-using MatMulFixture = MatMulValidationFixture<CLTensor, CLAccessor, CLMatMul, GpuMatMulSettings, T>;
+using CLMatMulFixture = MatMulValidationFixture<CLTensor, CLAccessor, CLMatMul, GpuMatMulSettings, T>;
+
+template <typename T>
+using CLQuantizedMatMulFixture = QuantizedMatMulValidationFixture<CLTensor, CLAccessor, CLMatMul, GpuMatMulSettings, T>;
 
 TEST_SUITE(CL)
 TEST_SUITE(MatMul)
+
+TEST_SUITE(Float)
 TEST_SUITE(FP32)
-FIXTURE_DATA_TEST_CASE(RunSmall, MatMulFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallMatMulDataset(),
-                                                                                                                  framework::dataset::make("pretransose_A", { false, true })),
-                                                                                                          framework::dataset::make("pretransose_B", { false, true })),
-                                                                                                  framework::dataset::make("DataType", DataType::F32)))
+
+FIXTURE_DATA_TEST_CASE(RunSmall, CLMatMulFixture<float>, framework::DatasetMode::ALL, combine(combine(combine(datasets::SmallMatMulDataset(),
+                                                                                                                    framework::dataset::make("TransposeA", { false, true })),
+                                                                                                            framework::dataset::make("TransposeB", { false, true })),
+                                                                                                    framework::dataset::make("DataType", DataType::F32)))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_f32, 0.f, abs_tolerance_f32);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, MatMulFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeMatMulDataset(),
-                                                                                                                framework::dataset::make("pretransose_A", { false, true })),
-                                                                                                        framework::dataset::make("pretransose_B", { false, true })),
-                                                                                                framework::dataset::make("DataType", DataType::F32)))
+
+FIXTURE_DATA_TEST_CASE(RunLarge, CLMatMulFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeMatMulDataset(),
+                                                                                                                  framework::dataset::make("TransposeA", { false, true })),
+                                                                                                          framework::dataset::make("TransposeB", { false, true })),
+                                                                                                  framework::dataset::make("DataType", DataType::F32)))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_f32, 0.f, abs_tolerance_f32);
 }
+
 TEST_SUITE_END() // FP32
+
 TEST_SUITE(FP16)
-FIXTURE_DATA_TEST_CASE(RunSmall, MatMulFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallMatMulDataset(),
-                                                                                                                 framework::dataset::make("pretransose_A", { false, true })),
-                                                                                                         framework::dataset::make("pretransose_B", { false, true })),
-                                                                                                 framework::dataset::make("DataType", DataType::F16)))
+
+FIXTURE_DATA_TEST_CASE(RunSmall, CLMatMulFixture<half>, framework::DatasetMode::ALL, combine(combine(combine(datasets::SmallMatMulDataset(),
+                                                                                                                   framework::dataset::make("TransposeA", { false, true })),
+                                                                                                           framework::dataset::make("TransposeB", { false, true })),
+                                                                                                   framework::dataset::make("DataType", DataType::F16)))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_f16, 0.f, abs_tolerance_f16);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, MatMulFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeMatMulDataset(),
-                                                                                                               framework::dataset::make("pretransose_A", { false, true })),
-                                                                                                       framework::dataset::make("pretransose_B", { false, true })),
-                                                                                               framework::dataset::make("DataType", DataType::F16)))
+
+FIXTURE_DATA_TEST_CASE(RunLarge, CLMatMulFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeMatMulDataset(),
+                                                                                                                 framework::dataset::make("TransposeA", { false, true })),
+                                                                                                         framework::dataset::make("TransposeB", { false, true })),
+                                                                                                 framework::dataset::make("DataType", DataType::F16)))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_f16, 0.f, abs_tolerance_f16);
 }
+
 TEST_SUITE_END() // FP16
+TEST_SUITE_END() // Float
+
+TEST_SUITE(Quantized)
+TEST_SUITE(QASYMM8)
+
+FIXTURE_DATA_TEST_CASE(RunSmall, CLQuantizedMatMulFixture<uint8_t>, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(combine(combine(combine(
+    datasets::SmallMatMulDataset(),
+    framework::dataset::make("TransposeA", { false, true })),
+    framework::dataset::make("TransposeB", { false, true })),
+    framework::dataset::make("DataType", DataType::QASYMM8)),
+    framework::dataset::make("ActivationInfo", { ActivationLayerInfo() })),
+    framework::dataset::make("NumberOfExtraRuns", { 0, 1 })),
+    framework::dataset::make("LhsQInfo", { QuantizationInfo(1.f / 50, 1) })),
+    framework::dataset::make("RhsQInfo", { QuantizationInfo(1.f / 30, -1) })),
+    framework::dataset::make("DstQInfo", { QuantizationInfo(1.f, 2) }))
+)
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_quant);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLarge, CLQuantizedMatMulFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(combine(combine(combine(
+    datasets::LargeMatMulDataset(),
+    framework::dataset::make("TransposeA", { false, true })),
+    framework::dataset::make("TransposeB", { false, true })),
+    framework::dataset::make("DataType", DataType::QASYMM8)),
+    framework::dataset::make("ActivationInfo", { ActivationLayerInfo() })),
+    framework::dataset::make("NumberOfExtraRuns", { 0, 1 })),
+    framework::dataset::make("LhsQInfo", { QuantizationInfo(1.f / 100, 1) })),
+    framework::dataset::make("RhsQInfo", { QuantizationInfo(1.f / 200, -1) })),
+    framework::dataset::make("DstQInfo", { QuantizationInfo(1.f, 2) }))
+)
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_quant);
+}
+
+TEST_SUITE_END() // QASYMM8
+
+TEST_SUITE(QASYMM8_SIGNED)
+
+FIXTURE_DATA_TEST_CASE(RunSmall, CLQuantizedMatMulFixture<int8_t>, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(combine(combine(combine(
+    datasets::SmallMatMulDataset(),
+    framework::dataset::make("TransposeA", { false, true })),
+    framework::dataset::make("TransposeB", { false, true })),
+    framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
+    framework::dataset::make("ActivationInfo", { ActivationLayerInfo() })),
+    framework::dataset::make("NumberOfExtraRuns", { 0, 1 })),
+    framework::dataset::make("LhsQInfo", { QuantizationInfo(1.f / 50, 1) })),
+    framework::dataset::make("RhsQInfo", { QuantizationInfo(1.f / 30, -1) })),
+    framework::dataset::make("DstQInfo", { QuantizationInfo(1.f, 2) }))
+)
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_quant);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLarge, CLQuantizedMatMulFixture<int8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(combine(combine(combine(
+    datasets::LargeMatMulDataset(),
+    framework::dataset::make("TransposeA", { false, true })),
+    framework::dataset::make("TransposeB", { false, true })),
+    framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
+    framework::dataset::make("ActivationInfo", { ActivationLayerInfo() })),
+    framework::dataset::make("NumberOfExtraRuns", { 0, 1 })),
+    framework::dataset::make("LhsQInfo", { QuantizationInfo(1.f / 100, 1) })),
+    framework::dataset::make("RhsQInfo", { QuantizationInfo(1.f / 200, -1) })),
+    framework::dataset::make("DstQInfo", { QuantizationInfo(1.f, 2) }))
+)
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_quant);
+}
+
+TEST_SUITE_END() // QASYMM8_SIGNED
+
+TEST_SUITE_END() // Quantized
+
 TEST_SUITE_END() // MatMul
 TEST_SUITE_END() // CL
 } // namespace validation
diff --git a/tests/validation/fixtures/MatMulFixture.h b/tests/validation/fixtures/MatMulFixture.h
index 15719024b1..2f94c1f9d2 100644
--- a/tests/validation/fixtures/MatMulFixture.h
+++ b/tests/validation/fixtures/MatMulFixture.h
@@ -21,8 +21,8 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef TESTS_VALIDATION_FIXTURES_MATMULFIXTURE
-#define TESTS_VALIDATION_FIXTURES_MATMULFIXTURE
+#ifndef ACL_TESTS_VALIDATION_FIXTURES_MATMULFIXTURE
+#define ACL_TESTS_VALIDATION_FIXTURES_MATMULFIXTURE
 
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Utils.h"
@@ -188,8 +188,9 @@ protected:
         std::vector<int32_t> output_multipliers{ output_multiplier };
         std::vector<int32_t> output_shifts{ output_shift };
 
+        // The lhs and rhs offsets are negated here to keep the reference aligned with the function implementation, where the lhs and rhs offsets are also negated.
         const auto tmp = reference::gemmlowp_matrix_multiply_core<int32_t>(
-            a, b, c.shape(), aq.offset, bq.offset);
+            a, b, c.shape(), -aq.offset, -bq.offset);
 
         auto output = reference::gemmlowp_quantize_down_scale_by_fixedpoint<int32_t, TT>(
             tmp, output_multipliers, output_shifts, oq.offset,
@@ -314,4 +315,4 @@ public:
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_MATMUL_FIXTURE */
+#endif /* ACL_TESTS_VALIDATION_FIXTURES_MATMULFIXTURE */
author	Jakub Sujak <jakub.sujak@arm.com>	2023-04-17 12:08:48 +0100
committer	Omar Al Khatib <omar.alkhatib@arm.com>	2023-05-05 14:48:28 +0000
commit	e9b3ee2badebf91188c1cd0e59d6aaa30ed60985 (patch)
tree	750c39df7c0113caf6a893bb6af6e9ef1ecc3756
parent	edafe7f5fdc056fddc395c70420fc869dcb7d9fb (diff)
download	ComputeLibrary-e9b3ee2badebf91188c1cd0e59d6aaa30ed60985.tar.gz