aboutsummaryrefslogtreecommitdiff
path: root/src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.cpp')
-rw-r--r--src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.cpp129
1 files changed, 104 insertions, 25 deletions
diff --git a/src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.cpp b/src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.cpp
index 7a62186677..2622274587 100644
--- a/src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.cpp
+++ b/src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,23 +23,15 @@
*/
#include "src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/KernelDescriptors.h"
#include "arm_compute/core/Log.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/MemoryHelpers.h"
#include "src/gpu/cl/kernels/ClCastKernel.h"
#include "src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.h"
#include "src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.h"
+#include "src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel.h"
#include "src/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.h"
#include "src/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.h"
#include "src/gpu/cl/kernels/ClGemmLowpReductionKernel.h"
@@ -47,9 +39,6 @@
#include "src/gpu/cl/utils/ClAuxTensorHandler.h"
#include "src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.h"
-#include "src/common/utils/Log.h"
-#include "utils/TypePrinter.h"
-
namespace arm_compute
{
namespace opencl
@@ -67,6 +56,7 @@ inline bool validate_gemm_kernel(CLGEMMKernelType kernel_type)
{
case CLGEMMKernelType::NATIVE:
case CLGEMMKernelType::RESHAPED_ONLY_RHS:
+ case CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL:
{
return true;
}
@@ -165,6 +155,41 @@ inline bool validate_lhs_rhs_info_reshaped_only_rhs(const GEMMLHSMatrixInfo &lhs
return true;
}
+// Validate lhs_info and rhs_info for reshaped only rhs kernel
+inline bool validate_lhs_rhs_info_reshaped_only_rhs_mmul(const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *output,
+ unsigned int m, unsigned int n, unsigned int k, bool reinterpret_input_as_3d, int depth_output_gemm3d)
+{
+ // Validate GEMMLHSMatrixInfo and GEMMRHSMatrixInfo for reshaped only rhs kernel
+ TensorInfo tmp_b_info{};
+ // Validate reshape RHS kernel
+ auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info)));
+ if(!bool(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info)))
+ {
+ return false;
+ }
+ // Validate mm kernel
+ // NOTE: Ignore all other parameters (eg. depth_output_gemm3d, output stage etc.) and only validate lhs and rhs info
+ // NOTE: This assumes:
+ // 1. lhs and rhs info's validity does not depend on these other parameters and vice versa(in ClGemmLowpMatrixMultiplyReshapedOnlyRHSKernel.cpp validate_arguments).
+ // 2. lhs and rhs info does not cause window and padding issues through side effects (in ClGemmLowpMatrixMultiplyReshapedOnlyRHSKernel.cpp validate_and_configure_window).
+ GEMMKernelInfo gemm_kernel_info;
+ gemm_kernel_info.m = m;
+ gemm_kernel_info.n = n;
+ gemm_kernel_info.k = k;
+ gemm_kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d;
+ gemm_kernel_info.depth_output_gemm3d = depth_output_gemm3d;
+ gemm_kernel_info.lhs_info = lhs_info;
+ gemm_kernel_info.rhs_info = rhs_info;
+ // Since we ignore the output stage, output data type has to be S32 to pass the validation
+ TensorInfo output_info_copy(*output);
+ output_info_copy.set_data_type(DataType::S32);
+ if(!bool(ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel::validate(a, &tmp_b_info, &output_info_copy, gemm_kernel_info)))
+ {
+ return false;
+ }
+ return true;
+}
+
// Automatically select between mlgo (prioritized) and default heuristics for reshaped only rhs kernel configs
std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> auto_select_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery query, bool reinterpret_input_as_3d, int depth_output_gemm3d,
const ITensorInfo *a,
@@ -184,6 +209,19 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> auto_select_gemm_config_reshaped
return { config.lhs_info, config.rhs_info };
}
+// Automatically select between mlgo (prioritized) and default heuristics for reshaped only rhs kernel configs
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> auto_select_gemm_config_reshaped_only_rhs_mmul(auto_heuristics::CommonQuery query, bool reinterpret_input_as_3d, int depth_output_gemm3d,
+ const ITensorInfo *a,
+ const ITensorInfo *b, const ITensorInfo *output)
+{
+ ARM_COMPUTE_UNUSED(a, b, output, reinterpret_input_as_3d, depth_output_gemm3d);
+ auto config = auto_heuristics::select_default_gemm_config_reshaped_only_rhs(query);
+ validate_lhs_rhs_info_reshaped_only_rhs_mmul(config.lhs_info, config.rhs_info, a, b, output, query.m, query.n, query.k, reinterpret_input_as_3d, depth_output_gemm3d);
+ ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use reshaped_only_rhs_mmul config from default heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(),
+ to_string(config.rhs_info).c_str());
+ return { config.lhs_info, config.rhs_info };
+}
+
inline bool is_gemm_reshaped(CLGEMMKernelType kernel_type)
{
switch(kernel_type)
@@ -191,6 +229,7 @@ inline bool is_gemm_reshaped(CLGEMMKernelType kernel_type)
case CLGEMMKernelType::NATIVE:
return false;
case CLGEMMKernelType::RESHAPED_ONLY_RHS:
+ case CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL:
return true;
default:
ARM_COMPUTE_ERROR("Not supported gemmlowp kernel!");
@@ -202,6 +241,7 @@ ClGemmLowpMatrixMultiplyCore::ClGemmLowpMatrixMultiplyCore()
: _weights_to_qasymm8(std::make_unique<ClCastKernel>()),
_mm_native_kernel(std::make_unique<ClGemmLowpMatrixMultiplyNativeKernel>()),
_mm_reshaped_only_rhs_kernel(std::make_unique<ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel>()),
+ _mm_reshaped_only_rhs_mmul_kernel(std::make_unique<ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel>()),
_mtx_b_reshape_kernel(std::make_unique<ClGemmReshapeRhsMatrixKernel>()),
_mtx_a_reduction_kernel(std::make_unique<ClGemmLowpMatrixAReductionKernel>()),
_mtx_b_reduction_kernel(std::make_unique<ClGemmLowpMatrixBReductionKernel>()),
@@ -218,7 +258,7 @@ void ClGemmLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_con
const GEMMInfo &gemm_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
- ARM_COMPUTE_ERROR_THROW_ON(ClGemmLowpMatrixMultiplyCore::validate(a, b, c != nullptr ? c : nullptr, output, gemm_info));
+ ARM_COMPUTE_ERROR_THROW_ON(ClGemmLowpMatrixMultiplyCore::validate(a, b, c, output, gemm_info));
ARM_COMPUTE_LOG_PARAMS(a, b, c, output, gemm_info);
_reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();
@@ -234,6 +274,7 @@ void ClGemmLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_con
// Set the target for the kernels
_mm_native_kernel->set_target(gpu_target);
_mm_reshaped_only_rhs_kernel->set_target(gpu_target);
+ _mm_reshaped_only_rhs_mmul_kernel->set_target(gpu_target);
GEMMRHSMatrixInfo rhs_info;
GEMMLHSMatrixInfo lhs_info;
@@ -249,8 +290,7 @@ void ClGemmLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_con
const auto reshape_info = GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d);
- // Check if we need to reshape the matrix A and matrix B
- _is_gemm_reshaped = is_gemm_reshaped(auto_select_gemm_kernel(auto_heuristics::CommonQuery{ gpu_target, a->data_type(), m, n, k, batch_size }, _reshape_b_only_on_first_run));
+ _gemm_kernel_type = auto_select_gemm_kernel(auto_heuristics::CommonQuery{ gpu_target, a->data_type(), m, n, k, batch_size }, _reshape_b_only_on_first_run);
if(_convert_to_qasymm8)
{
@@ -261,7 +301,7 @@ void ClGemmLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_con
}
ITensorInfo *matrix_b = _convert_to_qasymm8 ? &_qasymm8_weights : b;
- if(_is_gemm_reshaped)
+ if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS)
{
matrix_b = &_tmp_b;
@@ -274,6 +314,19 @@ void ClGemmLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_con
// Configure reshape RHS kernel
_mtx_b_reshape_kernel->configure(compile_context, _convert_to_qasymm8 ? &_qasymm8_weights : b, &_tmp_b, rhs_info);
}
+ if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL)
+ {
+ matrix_b = &_tmp_b;
+
+ // Pick up the GEMM configuration
+ // It doesn't matter whether Datatype is DataType::QASYMM8 or DataType::QASYMM8_SIGNED, since it only affect the shape configuration
+ std::tie(lhs_info, rhs_info) = auto_select_gemm_config_reshaped_only_rhs_mmul(auto_heuristics::CommonQuery{ gpu_target, DataType::QASYMM8, m, n, k, batch_size }, reinterpret_input_as_3d,
+ depth_output_gemm3d,
+ a, _convert_to_qasymm8 ? &_qasymm8_weights : b, output);
+
+ // Configure reshape RHS kernel
+ _mtx_b_reshape_kernel->configure(compile_context, _convert_to_qasymm8 ? &_qasymm8_weights : b, &_tmp_b, rhs_info);
+ }
// Using default reduction info
const GEMMLowpReductionKernelInfo reduction_info {};
@@ -326,20 +379,30 @@ void ClGemmLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_con
gemm_kernel_info.output_stage = gemmlowp_output_stage;
- if(_is_gemm_reshaped && gemmlowp_output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
+ if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS && gemmlowp_output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
{
// Configure and tune matrix multiply kernel with fused output stage
_mm_reshaped_only_rhs_kernel->configure(compile_context, a, matrix_b, output, gemm_kernel_info, _a_offset == 0 ? nullptr : &_vector_sum_col,
_b_offset == 0 ? nullptr : &_vector_sum_row, c != nullptr ? c : nullptr, &_gemm_output_stage_multipliers, &_gemm_output_stage_shifts);
}
+ else if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL && gemmlowp_output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
+ {
+ // Configure and tune matrix multiply kernel with fused output stage
+ _mm_reshaped_only_rhs_mmul_kernel->configure(compile_context, a, matrix_b, output, gemm_kernel_info, _a_offset == 0 ? nullptr : &_vector_sum_col,
+ _b_offset == 0 ? nullptr : &_vector_sum_row, c != nullptr ? c : nullptr, &_gemm_output_stage_multipliers, &_gemm_output_stage_shifts);
+ }
else
{
_run_output_stage = true;
- if(_is_gemm_reshaped)
+ if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS)
{
_mm_reshaped_only_rhs_kernel->configure(compile_context, a, matrix_b, &_mm_result_s32, gemm_kernel_info);
}
+ if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL)
+ {
+ _mm_reshaped_only_rhs_mmul_kernel->configure(compile_context, a, matrix_b, &_mm_result_s32, gemm_kernel_info);
+ }
else
{
// Pick up the GEMM configuration
@@ -359,11 +422,16 @@ void ClGemmLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_con
else
{
_run_offset_contribution = true;
- if(_is_gemm_reshaped)
+ if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS)
{
// Configure and tune matrix multiply kernel
_mm_reshaped_only_rhs_kernel->configure(compile_context, a, matrix_b, output, gemm_kernel_info);
}
+ else if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL)
+ {
+ // Configure and tune matrix multiply kernel
+ _mm_reshaped_only_rhs_mmul_kernel->configure(compile_context, a, matrix_b, output, gemm_kernel_info);
+ }
else
{
// Pick up the GEMM configuration
@@ -382,7 +450,7 @@ void ClGemmLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_con
// Request memory
_aux_mem[RhsQAsymm8] = MemoryInfo(offset_int_vec(RhsQAsymm8), _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _qasymm8_weights.total_size());
- if(_is_gemm_reshaped)
+ if(is_gemm_reshaped(_gemm_kernel_type))
{
// Overwrite Rhs as prepare if gemm is reshaped as there will be a two-step transformation
_aux_mem[RhsQAsymm8] = MemoryInfo(offset_int_vec(RhsQAsymm8), _reshape_b_only_on_first_run ? MemoryLifetime::Prepare : MemoryLifetime::Temporary, _qasymm8_weights.total_size());
@@ -607,7 +675,7 @@ void ClGemmLowpMatrixMultiplyCore::run(ITensorPack &tensors)
const ITensor *matrix_a = a;
const ITensor *matrix_b = _convert_to_qasymm8 ? rhs_qasymm8.get() : b;
- if(_is_gemm_reshaped)
+ if(is_gemm_reshaped(_gemm_kernel_type))
{
matrix_b = tmp_b.get();
if(!_reshape_b_only_on_first_run)
@@ -645,7 +713,7 @@ void ClGemmLowpMatrixMultiplyCore::run(ITensorPack &tensors)
}
// Run matrix multiply
- if(_is_gemm_reshaped)
+ if(is_gemm_reshaped(_gemm_kernel_type))
{
ITensorPack gemm_reshaped_pack;
if(_run_offset_contribution)
@@ -669,7 +737,18 @@ void ClGemmLowpMatrixMultiplyCore::run(ITensorPack &tensors)
{ TensorType::ACL_DST, dst },
});
}
- CLScheduler::get().enqueue_op(*_mm_reshaped_only_rhs_kernel, gemm_reshaped_pack, false);
+ if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS)
+ {
+ CLScheduler::get().enqueue_op(*_mm_reshaped_only_rhs_kernel, gemm_reshaped_pack, false);
+ }
+ else if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL)
+ {
+ CLScheduler::get().enqueue_op(*_mm_reshaped_only_rhs_mmul_kernel, gemm_reshaped_pack, false);
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Invalid reshaped kernel");
+ }
}
else
{
@@ -728,7 +807,7 @@ void ClGemmLowpMatrixMultiplyCore::prepare(ITensorPack &tensors)
b->mark_as_unused();
}
- if(_is_gemm_reshaped && _reshape_b_only_on_first_run)
+ if(is_gemm_reshaped(_gemm_kernel_type) && _reshape_b_only_on_first_run)
{
// Run reshape kernel and mark original weights tensor as unused
ITensorPack mtx_b_pack =