1 files changed, 104 insertions, 25 deletions
diff --git a/src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.cpp b/src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.cpp
index 7a62186677..2622274587 100644
--- a/src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.cpp
+++ b/src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2022 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,23 +23,15 @@
  */
 #include "src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.h"
 
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/KernelDescriptors.h"
 #include "arm_compute/core/Log.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
 
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/MemoryHelpers.h"
 #include "src/gpu/cl/kernels/ClCastKernel.h"
 #include "src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.h"
 #include "src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.h"
+#include "src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel.h"
 #include "src/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.h"
 #include "src/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.h"
 #include "src/gpu/cl/kernels/ClGemmLowpReductionKernel.h"
@@ -47,9 +39,6 @@
 #include "src/gpu/cl/utils/ClAuxTensorHandler.h"
 #include "src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.h"
 
-#include "src/common/utils/Log.h"
-#include "utils/TypePrinter.h"
-
 namespace arm_compute
 {
 namespace opencl
@@ -67,6 +56,7 @@ inline bool validate_gemm_kernel(CLGEMMKernelType kernel_type)
     {
         case CLGEMMKernelType::NATIVE:
         case CLGEMMKernelType::RESHAPED_ONLY_RHS:
+        case CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL:
         {
             return true;
         }
@@ -165,6 +155,41 @@ inline bool validate_lhs_rhs_info_reshaped_only_rhs(const GEMMLHSMatrixInfo &lhs
     return true;
 }
 
+// Validate lhs_info and rhs_info for reshaped only rhs kernel
+inline bool validate_lhs_rhs_info_reshaped_only_rhs_mmul(const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *output,
+                                                         unsigned int m, unsigned int n, unsigned int k, bool reinterpret_input_as_3d, int depth_output_gemm3d)
+{
+    // Validate GEMMLHSMatrixInfo and GEMMRHSMatrixInfo for reshaped only rhs kernel
+    TensorInfo tmp_b_info{};
+    // Validate reshape RHS kernel
+    auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info)));
+    if(!bool(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info)))
+    {
+        return false;
+    }
+    // Validate mm kernel
+    // NOTE: Ignore all other parameters (eg. depth_output_gemm3d, output stage etc.) and only validate lhs and rhs info
+    // NOTE: This assumes:
+    //  1. lhs and rhs info's validity does not depend on these other parameters and vice versa(in ClGemmLowpMatrixMultiplyReshapedOnlyRHSKernel.cpp validate_arguments).
+    //  2. lhs and rhs info does not cause window and padding issues through side effects (in ClGemmLowpMatrixMultiplyReshapedOnlyRHSKernel.cpp validate_and_configure_window).
+    GEMMKernelInfo gemm_kernel_info;
+    gemm_kernel_info.m                       = m;
+    gemm_kernel_info.n                       = n;
+    gemm_kernel_info.k                       = k;
+    gemm_kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d;
+    gemm_kernel_info.depth_output_gemm3d     = depth_output_gemm3d;
+    gemm_kernel_info.lhs_info                = lhs_info;
+    gemm_kernel_info.rhs_info                = rhs_info;
+    // Since we ignore the output stage, output data type has to be S32 to pass the validation
+    TensorInfo output_info_copy(*output);
+    output_info_copy.set_data_type(DataType::S32);
+    if(!bool(ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel::validate(a, &tmp_b_info, &output_info_copy, gemm_kernel_info)))
+    {
+        return false;
+    }
+    return true;
+}
+
 // Automatically select between mlgo (prioritized) and default heuristics for reshaped only rhs kernel configs
 std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> auto_select_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery query, bool reinterpret_input_as_3d, int depth_output_gemm3d,
                                                                                           const ITensorInfo *a,
@@ -184,6 +209,19 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> auto_select_gemm_config_reshaped
     return { config.lhs_info, config.rhs_info };
 }
 
+// Automatically select between mlgo (prioritized) and default heuristics for reshaped only rhs kernel configs
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> auto_select_gemm_config_reshaped_only_rhs_mmul(auto_heuristics::CommonQuery query, bool reinterpret_input_as_3d, int depth_output_gemm3d,
+                                                                                               const ITensorInfo *a,
+                                                                                               const ITensorInfo *b, const ITensorInfo *output)
+{
+    ARM_COMPUTE_UNUSED(a, b, output, reinterpret_input_as_3d, depth_output_gemm3d);
+    auto config = auto_heuristics::select_default_gemm_config_reshaped_only_rhs(query);
+    validate_lhs_rhs_info_reshaped_only_rhs_mmul(config.lhs_info, config.rhs_info, a, b, output, query.m, query.n, query.k, reinterpret_input_as_3d, depth_output_gemm3d);
+    ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use reshaped_only_rhs_mmul config from default heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(),
+                                              to_string(config.rhs_info).c_str());
+    return { config.lhs_info, config.rhs_info };
+}
+
 inline bool is_gemm_reshaped(CLGEMMKernelType kernel_type)
 {
     switch(kernel_type)
@@ -191,6 +229,7 @@ inline bool is_gemm_reshaped(CLGEMMKernelType kernel_type)
         case CLGEMMKernelType::NATIVE:
             return false;
         case CLGEMMKernelType::RESHAPED_ONLY_RHS:
+        case CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL:
             return true;
         default:
             ARM_COMPUTE_ERROR("Not supported gemmlowp kernel!");
@@ -202,6 +241,7 @@ ClGemmLowpMatrixMultiplyCore::ClGemmLowpMatrixMultiplyCore()
     : _weights_to_qasymm8(std::make_unique<ClCastKernel>()),
       _mm_native_kernel(std::make_unique<ClGemmLowpMatrixMultiplyNativeKernel>()),
       _mm_reshaped_only_rhs_kernel(std::make_unique<ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel>()),
+      _mm_reshaped_only_rhs_mmul_kernel(std::make_unique<ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel>()),
       _mtx_b_reshape_kernel(std::make_unique<ClGemmReshapeRhsMatrixKernel>()),
       _mtx_a_reduction_kernel(std::make_unique<ClGemmLowpMatrixAReductionKernel>()),
       _mtx_b_reduction_kernel(std::make_unique<ClGemmLowpMatrixBReductionKernel>()),
@@ -218,7 +258,7 @@ void ClGemmLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_con
                                              const GEMMInfo &gemm_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
-    ARM_COMPUTE_ERROR_THROW_ON(ClGemmLowpMatrixMultiplyCore::validate(a, b, c != nullptr ? c : nullptr, output, gemm_info));
+    ARM_COMPUTE_ERROR_THROW_ON(ClGemmLowpMatrixMultiplyCore::validate(a, b, c, output, gemm_info));
     ARM_COMPUTE_LOG_PARAMS(a, b, c, output, gemm_info);
 
     _reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();
@@ -234,6 +274,7 @@ void ClGemmLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_con
     // Set the target for the kernels
     _mm_native_kernel->set_target(gpu_target);
     _mm_reshaped_only_rhs_kernel->set_target(gpu_target);
+    _mm_reshaped_only_rhs_mmul_kernel->set_target(gpu_target);
 
     GEMMRHSMatrixInfo rhs_info;
     GEMMLHSMatrixInfo lhs_info;
@@ -249,8 +290,7 @@ void ClGemmLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_con
 
     const auto reshape_info = GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d);
 
-    // Check if we need to reshape the matrix A and matrix B
-    _is_gemm_reshaped = is_gemm_reshaped(auto_select_gemm_kernel(auto_heuristics::CommonQuery{ gpu_target, a->data_type(), m, n, k, batch_size }, _reshape_b_only_on_first_run));
+    _gemm_kernel_type = auto_select_gemm_kernel(auto_heuristics::CommonQuery{ gpu_target, a->data_type(), m, n, k, batch_size }, _reshape_b_only_on_first_run);
 
     if(_convert_to_qasymm8)
     {
@@ -261,7 +301,7 @@ void ClGemmLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_con
     }
 
     ITensorInfo *matrix_b = _convert_to_qasymm8 ? &_qasymm8_weights : b;
-    if(_is_gemm_reshaped)
+    if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS)
     {
         matrix_b = &_tmp_b;
 
@@ -274,6 +314,19 @@ void ClGemmLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_con
         // Configure reshape RHS kernel
         _mtx_b_reshape_kernel->configure(compile_context, _convert_to_qasymm8 ? &_qasymm8_weights : b, &_tmp_b, rhs_info);
     }
+    if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL)
+    {
+        matrix_b = &_tmp_b;
+
+        // Pick up the GEMM configuration
+        // It doesn't matter whether Datatype is DataType::QASYMM8 or DataType::QASYMM8_SIGNED, since it only affect the shape configuration
+        std::tie(lhs_info, rhs_info) = auto_select_gemm_config_reshaped_only_rhs_mmul(auto_heuristics::CommonQuery{ gpu_target, DataType::QASYMM8, m, n, k, batch_size }, reinterpret_input_as_3d,
+                                                                                      depth_output_gemm3d,
+                                                                                      a, _convert_to_qasymm8 ? &_qasymm8_weights : b, output);
+
+        // Configure reshape RHS kernel
+        _mtx_b_reshape_kernel->configure(compile_context, _convert_to_qasymm8 ? &_qasymm8_weights : b, &_tmp_b, rhs_info);
+    }
 
     // Using default reduction info
     const GEMMLowpReductionKernelInfo reduction_info {};
@@ -326,20 +379,30 @@ void ClGemmLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_con
 
         gemm_kernel_info.output_stage = gemmlowp_output_stage;
 
-        if(_is_gemm_reshaped && gemmlowp_output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
+        if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS && gemmlowp_output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
         {
             // Configure and tune matrix multiply kernel with fused output stage
             _mm_reshaped_only_rhs_kernel->configure(compile_context, a, matrix_b, output, gemm_kernel_info, _a_offset == 0 ? nullptr : &_vector_sum_col,
                                                     _b_offset == 0 ? nullptr : &_vector_sum_row, c != nullptr ? c : nullptr, &_gemm_output_stage_multipliers, &_gemm_output_stage_shifts);
         }
+        else if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL && gemmlowp_output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
+        {
+            // Configure and tune matrix multiply kernel with fused output stage
+            _mm_reshaped_only_rhs_mmul_kernel->configure(compile_context, a, matrix_b, output, gemm_kernel_info, _a_offset == 0 ? nullptr : &_vector_sum_col,
+                                                         _b_offset == 0 ? nullptr : &_vector_sum_row, c != nullptr ? c : nullptr, &_gemm_output_stage_multipliers, &_gemm_output_stage_shifts);
+        }
         else
         {
             _run_output_stage = true;
 
-            if(_is_gemm_reshaped)
+            if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS)
             {
                 _mm_reshaped_only_rhs_kernel->configure(compile_context, a, matrix_b, &_mm_result_s32, gemm_kernel_info);
             }
+            if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL)
+            {
+                _mm_reshaped_only_rhs_mmul_kernel->configure(compile_context, a, matrix_b, &_mm_result_s32, gemm_kernel_info);
+            }
             else
             {
                 // Pick up the GEMM configuration
@@ -359,11 +422,16 @@ void ClGemmLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_con
     else
     {
         _run_offset_contribution = true;
-        if(_is_gemm_reshaped)
+        if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS)
         {
             // Configure and tune matrix multiply kernel
             _mm_reshaped_only_rhs_kernel->configure(compile_context, a, matrix_b, output, gemm_kernel_info);
         }
+        else if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL)
+        {
+            // Configure and tune matrix multiply kernel
+            _mm_reshaped_only_rhs_mmul_kernel->configure(compile_context, a, matrix_b, output, gemm_kernel_info);
+        }
         else
         {
             // Pick up the GEMM configuration
@@ -382,7 +450,7 @@ void ClGemmLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_con
 
     // Request memory
     _aux_mem[RhsQAsymm8] = MemoryInfo(offset_int_vec(RhsQAsymm8), _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _qasymm8_weights.total_size());
-    if(_is_gemm_reshaped)
+    if(is_gemm_reshaped(_gemm_kernel_type))
     {
         // Overwrite Rhs as prepare if gemm is reshaped as there will be a two-step transformation
         _aux_mem[RhsQAsymm8] = MemoryInfo(offset_int_vec(RhsQAsymm8), _reshape_b_only_on_first_run ? MemoryLifetime::Prepare : MemoryLifetime::Temporary, _qasymm8_weights.total_size());
@@ -607,7 +675,7 @@ void ClGemmLowpMatrixMultiplyCore::run(ITensorPack &tensors)
     const ITensor *matrix_a = a;
     const ITensor *matrix_b = _convert_to_qasymm8 ? rhs_qasymm8.get() : b;
 
-    if(_is_gemm_reshaped)
+    if(is_gemm_reshaped(_gemm_kernel_type))
     {
         matrix_b = tmp_b.get();
         if(!_reshape_b_only_on_first_run)
@@ -645,7 +713,7 @@ void ClGemmLowpMatrixMultiplyCore::run(ITensorPack &tensors)
     }
 
     // Run matrix multiply
-    if(_is_gemm_reshaped)
+    if(is_gemm_reshaped(_gemm_kernel_type))
     {
         ITensorPack gemm_reshaped_pack;
         if(_run_offset_contribution)
@@ -669,7 +737,18 @@ void ClGemmLowpMatrixMultiplyCore::run(ITensorPack &tensors)
                 { TensorType::ACL_DST, dst },
             });
         }
-        CLScheduler::get().enqueue_op(*_mm_reshaped_only_rhs_kernel, gemm_reshaped_pack, false);
+        if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS)
+        {
+            CLScheduler::get().enqueue_op(*_mm_reshaped_only_rhs_kernel, gemm_reshaped_pack, false);
+        }
+        else if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL)
+        {
+            CLScheduler::get().enqueue_op(*_mm_reshaped_only_rhs_mmul_kernel, gemm_reshaped_pack, false);
+        }
+        else
+        {
+            ARM_COMPUTE_ERROR("Invalid reshaped kernel");
+        }
     }
     else
     {
@@ -728,7 +807,7 @@ void ClGemmLowpMatrixMultiplyCore::prepare(ITensorPack &tensors)
             b->mark_as_unused();
         }
 
-        if(_is_gemm_reshaped && _reshape_b_only_on_first_run)
+        if(is_gemm_reshaped(_gemm_kernel_type) && _reshape_b_only_on_first_run)
         {
             // Run reshape kernel and mark original weights tensor as unused
             ITensorPack mtx_b_pack =