diff options
author | Freddie Liardet <frederick.liardet@arm.com> | 2022-05-16 14:09:10 +0100 |
---|---|---|
committer | Gunes Bayir <gunes.bayir@arm.com> | 2022-07-22 10:18:41 +0000 |
commit | e572dff7adc334a98ac4a0326d66037451d5d079 (patch) | |
tree | 9c4db3d743078de9bda67dfed674e3f371a4e238 /src/gpu/cl/operators | |
parent | e87120731ca65c54b082734af07f748ac9651427 (diff) | |
download | ComputeLibrary-e572dff7adc334a98ac4a0326d66037451d5d079.tar.gz |
Add GemmLowp MMUL Reshaped Only Rhs Support for QASYMM8/QASYMM8_SIGNED
This patch introduces a GEMMLowp routine that is optimized for Arm(R) Mali(TM)-G715 and Arm(R) Mali(TM)-G615
Resolves: COMPMID-5398
Signed-off-by: Freddie Liardet <frederick.liardet@arm.com>
Signed-off-by: Gunes Bayir <gunes.bayir@arm.com>
Change-Id: I8d06453645688f3658b6c7c06f1ebc25a2505661
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/7932
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: SiCong Li <sicong.li@arm.com>
Reviewed-by: Pablo Marquez Tello <pablo.tello@arm.com>
Benchmark: Arm Jenkins <bsgcomp@arm.com>
Diffstat (limited to 'src/gpu/cl/operators')
-rw-r--r-- | src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.cpp | 129 | ||||
-rw-r--r-- | src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.h | 38 |
2 files changed, 124 insertions, 43 deletions
diff --git a/src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.cpp b/src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.cpp index 7a62186677..2622274587 100644 --- a/src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.cpp +++ b/src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2021 Arm Limited. + * Copyright (c) 2017-2022 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,23 +23,15 @@ */ #include "src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/KernelDescriptors.h" #include "arm_compute/core/Log.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "arm_compute/core/utils/quantization/AsymmHelpers.h" -#include "arm_compute/runtime/CL/CLScheduler.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/MemoryHelpers.h" #include "src/gpu/cl/kernels/ClCastKernel.h" #include "src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.h" #include "src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.h" +#include "src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel.h" #include "src/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.h" #include "src/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.h" #include "src/gpu/cl/kernels/ClGemmLowpReductionKernel.h" @@ -47,9 +39,6 @@ #include "src/gpu/cl/utils/ClAuxTensorHandler.h" #include "src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.h" -#include "src/common/utils/Log.h" -#include "utils/TypePrinter.h" - namespace arm_compute { namespace opencl @@ -67,6 +56,7 @@ inline bool validate_gemm_kernel(CLGEMMKernelType kernel_type) { case CLGEMMKernelType::NATIVE: case CLGEMMKernelType::RESHAPED_ONLY_RHS: + case CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL: { return true; } @@ -165,6 +155,41 @@ inline bool validate_lhs_rhs_info_reshaped_only_rhs(const GEMMLHSMatrixInfo &lhs return true; } +// Validate lhs_info and rhs_info for reshaped only rhs kernel +inline bool validate_lhs_rhs_info_reshaped_only_rhs_mmul(const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *output, + unsigned int m, unsigned int n, unsigned int k, bool reinterpret_input_as_3d, int depth_output_gemm3d) +{ + // Validate GEMMLHSMatrixInfo and GEMMRHSMatrixInfo for reshaped only rhs kernel + TensorInfo tmp_b_info{}; + // Validate reshape RHS kernel + auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info))); + if(!bool(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info))) + { + return false; + } + // Validate mm kernel + // NOTE: Ignore all other parameters (eg. depth_output_gemm3d, output stage etc.) and only validate lhs and rhs info + // NOTE: This assumes: + // 1. lhs and rhs info's validity does not depend on these other parameters and vice versa(in ClGemmLowpMatrixMultiplyReshapedOnlyRHSKernel.cpp validate_arguments). + // 2. lhs and rhs info does not cause window and padding issues through side effects (in ClGemmLowpMatrixMultiplyReshapedOnlyRHSKernel.cpp validate_and_configure_window). + GEMMKernelInfo gemm_kernel_info; + gemm_kernel_info.m = m; + gemm_kernel_info.n = n; + gemm_kernel_info.k = k; + gemm_kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d; + gemm_kernel_info.depth_output_gemm3d = depth_output_gemm3d; + gemm_kernel_info.lhs_info = lhs_info; + gemm_kernel_info.rhs_info = rhs_info; + // Since we ignore the output stage, output data type has to be S32 to pass the validation + TensorInfo output_info_copy(*output); + output_info_copy.set_data_type(DataType::S32); + if(!bool(ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel::validate(a, &tmp_b_info, &output_info_copy, gemm_kernel_info))) + { + return false; + } + return true; +} + // Automatically select between mlgo (prioritized) and default heuristics for reshaped only rhs kernel configs std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> auto_select_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery query, bool reinterpret_input_as_3d, int depth_output_gemm3d, const ITensorInfo *a, @@ -184,6 +209,19 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> auto_select_gemm_config_reshaped return { config.lhs_info, config.rhs_info }; } +// Automatically select between mlgo (prioritized) and default heuristics for reshaped only rhs kernel configs +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> auto_select_gemm_config_reshaped_only_rhs_mmul(auto_heuristics::CommonQuery query, bool reinterpret_input_as_3d, int depth_output_gemm3d, + const ITensorInfo *a, + const ITensorInfo *b, const ITensorInfo *output) +{ + ARM_COMPUTE_UNUSED(a, b, output, reinterpret_input_as_3d, depth_output_gemm3d); + auto config = auto_heuristics::select_default_gemm_config_reshaped_only_rhs(query); + validate_lhs_rhs_info_reshaped_only_rhs_mmul(config.lhs_info, config.rhs_info, a, b, output, query.m, query.n, query.k, reinterpret_input_as_3d, depth_output_gemm3d); + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use reshaped_only_rhs_mmul config from default heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), + to_string(config.rhs_info).c_str()); + return { config.lhs_info, config.rhs_info }; +} + inline bool is_gemm_reshaped(CLGEMMKernelType kernel_type) { switch(kernel_type) @@ -191,6 +229,7 @@ inline bool is_gemm_reshaped(CLGEMMKernelType kernel_type) case CLGEMMKernelType::NATIVE: return false; case CLGEMMKernelType::RESHAPED_ONLY_RHS: + case CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL: return true; default: ARM_COMPUTE_ERROR("Not supported gemmlowp kernel!"); @@ -202,6 +241,7 @@ ClGemmLowpMatrixMultiplyCore::ClGemmLowpMatrixMultiplyCore() : _weights_to_qasymm8(std::make_unique<ClCastKernel>()), _mm_native_kernel(std::make_unique<ClGemmLowpMatrixMultiplyNativeKernel>()), _mm_reshaped_only_rhs_kernel(std::make_unique<ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel>()), + _mm_reshaped_only_rhs_mmul_kernel(std::make_unique<ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel>()), _mtx_b_reshape_kernel(std::make_unique<ClGemmReshapeRhsMatrixKernel>()), _mtx_a_reduction_kernel(std::make_unique<ClGemmLowpMatrixAReductionKernel>()), _mtx_b_reduction_kernel(std::make_unique<ClGemmLowpMatrixBReductionKernel>()), @@ -218,7 +258,7 @@ void ClGemmLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_con const GEMMInfo &gemm_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output); - ARM_COMPUTE_ERROR_THROW_ON(ClGemmLowpMatrixMultiplyCore::validate(a, b, c != nullptr ? c : nullptr, output, gemm_info)); + ARM_COMPUTE_ERROR_THROW_ON(ClGemmLowpMatrixMultiplyCore::validate(a, b, c, output, gemm_info)); ARM_COMPUTE_LOG_PARAMS(a, b, c, output, gemm_info); _reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run(); @@ -234,6 +274,7 @@ void ClGemmLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_con // Set the target for the kernels _mm_native_kernel->set_target(gpu_target); _mm_reshaped_only_rhs_kernel->set_target(gpu_target); + _mm_reshaped_only_rhs_mmul_kernel->set_target(gpu_target); GEMMRHSMatrixInfo rhs_info; GEMMLHSMatrixInfo lhs_info; @@ -249,8 +290,7 @@ void ClGemmLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_con const auto reshape_info = GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d); - // Check if we need to reshape the matrix A and matrix B - _is_gemm_reshaped = is_gemm_reshaped(auto_select_gemm_kernel(auto_heuristics::CommonQuery{ gpu_target, a->data_type(), m, n, k, batch_size }, _reshape_b_only_on_first_run)); + _gemm_kernel_type = auto_select_gemm_kernel(auto_heuristics::CommonQuery{ gpu_target, a->data_type(), m, n, k, batch_size }, _reshape_b_only_on_first_run); if(_convert_to_qasymm8) { @@ -261,7 +301,7 @@ void ClGemmLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_con } ITensorInfo *matrix_b = _convert_to_qasymm8 ? &_qasymm8_weights : b; - if(_is_gemm_reshaped) + if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS) { matrix_b = &_tmp_b; @@ -274,6 +314,19 @@ void ClGemmLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_con // Configure reshape RHS kernel _mtx_b_reshape_kernel->configure(compile_context, _convert_to_qasymm8 ? &_qasymm8_weights : b, &_tmp_b, rhs_info); } + if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL) + { + matrix_b = &_tmp_b; + + // Pick up the GEMM configuration + // It doesn't matter whether Datatype is DataType::QASYMM8 or DataType::QASYMM8_SIGNED, since it only affect the shape configuration + std::tie(lhs_info, rhs_info) = auto_select_gemm_config_reshaped_only_rhs_mmul(auto_heuristics::CommonQuery{ gpu_target, DataType::QASYMM8, m, n, k, batch_size }, reinterpret_input_as_3d, + depth_output_gemm3d, + a, _convert_to_qasymm8 ? &_qasymm8_weights : b, output); + + // Configure reshape RHS kernel + _mtx_b_reshape_kernel->configure(compile_context, _convert_to_qasymm8 ? &_qasymm8_weights : b, &_tmp_b, rhs_info); + } // Using default reduction info const GEMMLowpReductionKernelInfo reduction_info {}; @@ -326,20 +379,30 @@ void ClGemmLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_con gemm_kernel_info.output_stage = gemmlowp_output_stage; - if(_is_gemm_reshaped && gemmlowp_output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) + if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS && gemmlowp_output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) { // Configure and tune matrix multiply kernel with fused output stage _mm_reshaped_only_rhs_kernel->configure(compile_context, a, matrix_b, output, gemm_kernel_info, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, c != nullptr ? c : nullptr, &_gemm_output_stage_multipliers, &_gemm_output_stage_shifts); } + else if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL && gemmlowp_output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) + { + // Configure and tune matrix multiply kernel with fused output stage + _mm_reshaped_only_rhs_mmul_kernel->configure(compile_context, a, matrix_b, output, gemm_kernel_info, _a_offset == 0 ? nullptr : &_vector_sum_col, + _b_offset == 0 ? nullptr : &_vector_sum_row, c != nullptr ? c : nullptr, &_gemm_output_stage_multipliers, &_gemm_output_stage_shifts); + } else { _run_output_stage = true; - if(_is_gemm_reshaped) + if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS) { _mm_reshaped_only_rhs_kernel->configure(compile_context, a, matrix_b, &_mm_result_s32, gemm_kernel_info); } + if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL) + { + _mm_reshaped_only_rhs_mmul_kernel->configure(compile_context, a, matrix_b, &_mm_result_s32, gemm_kernel_info); + } else { // Pick up the GEMM configuration @@ -359,11 +422,16 @@ void ClGemmLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_con else { _run_offset_contribution = true; - if(_is_gemm_reshaped) + if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS) { // Configure and tune matrix multiply kernel _mm_reshaped_only_rhs_kernel->configure(compile_context, a, matrix_b, output, gemm_kernel_info); } + else if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL) + { + // Configure and tune matrix multiply kernel + _mm_reshaped_only_rhs_mmul_kernel->configure(compile_context, a, matrix_b, output, gemm_kernel_info); + } else { // Pick up the GEMM configuration @@ -382,7 +450,7 @@ void ClGemmLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_con // Request memory _aux_mem[RhsQAsymm8] = MemoryInfo(offset_int_vec(RhsQAsymm8), _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _qasymm8_weights.total_size()); - if(_is_gemm_reshaped) + if(is_gemm_reshaped(_gemm_kernel_type)) { // Overwrite Rhs as prepare if gemm is reshaped as there will be a two-step transformation _aux_mem[RhsQAsymm8] = MemoryInfo(offset_int_vec(RhsQAsymm8), _reshape_b_only_on_first_run ? MemoryLifetime::Prepare : MemoryLifetime::Temporary, _qasymm8_weights.total_size()); @@ -607,7 +675,7 @@ void ClGemmLowpMatrixMultiplyCore::run(ITensorPack &tensors) const ITensor *matrix_a = a; const ITensor *matrix_b = _convert_to_qasymm8 ? rhs_qasymm8.get() : b; - if(_is_gemm_reshaped) + if(is_gemm_reshaped(_gemm_kernel_type)) { matrix_b = tmp_b.get(); if(!_reshape_b_only_on_first_run) @@ -645,7 +713,7 @@ void ClGemmLowpMatrixMultiplyCore::run(ITensorPack &tensors) } // Run matrix multiply - if(_is_gemm_reshaped) + if(is_gemm_reshaped(_gemm_kernel_type)) { ITensorPack gemm_reshaped_pack; if(_run_offset_contribution) @@ -669,7 +737,18 @@ void ClGemmLowpMatrixMultiplyCore::run(ITensorPack &tensors) { TensorType::ACL_DST, dst }, }); } - CLScheduler::get().enqueue_op(*_mm_reshaped_only_rhs_kernel, gemm_reshaped_pack, false); + if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS) + { + CLScheduler::get().enqueue_op(*_mm_reshaped_only_rhs_kernel, gemm_reshaped_pack, false); + } + else if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL) + { + CLScheduler::get().enqueue_op(*_mm_reshaped_only_rhs_mmul_kernel, gemm_reshaped_pack, false); + } + else + { + ARM_COMPUTE_ERROR("Invalid reshaped kernel"); + } } else { @@ -728,7 +807,7 @@ void ClGemmLowpMatrixMultiplyCore::prepare(ITensorPack &tensors) b->mark_as_unused(); } - if(_is_gemm_reshaped && _reshape_b_only_on_first_run) + if(is_gemm_reshaped(_gemm_kernel_type) && _reshape_b_only_on_first_run) { // Run reshape kernel and mark original weights tensor as unused ITensorPack mtx_b_pack = diff --git a/src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.h b/src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.h index 1965e3f97b..6fa4352bf8 100644 --- a/src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.h +++ b/src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2021 Arm Limited. + * Copyright (c) 2017-2022 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -40,6 +40,7 @@ namespace kernels class ClCastKernel; class ClGemmLowpMatrixMultiplyNativeKernel; class ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel; +class ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel; class ClGemmReshapeRhsMatrixKernel; class ClGemmLowpMatrixAReductionKernel; class ClGemmLowpMatrixBReductionKernel; @@ -120,14 +121,15 @@ private: private: // Kernels used - std::unique_ptr<kernels::ClCastKernel> _weights_to_qasymm8; - std::unique_ptr<kernels::ClGemmLowpMatrixMultiplyNativeKernel> _mm_native_kernel; - std::unique_ptr<kernels::ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel> _mm_reshaped_only_rhs_kernel; - std::unique_ptr<kernels::ClGemmReshapeRhsMatrixKernel> _mtx_b_reshape_kernel; - std::unique_ptr<kernels::ClGemmLowpMatrixAReductionKernel> _mtx_a_reduction_kernel; - std::unique_ptr<kernels::ClGemmLowpMatrixBReductionKernel> _mtx_b_reduction_kernel; - std::unique_ptr<kernels::ClGemmLowpOffsetContributionKernel> _offset_contribution_kernel; - std::unique_ptr<kernels::ClGemmLowpOffsetContributionOutputStageKernel> _offset_contribution_output_stage_kernel; + std::unique_ptr<kernels::ClCastKernel> _weights_to_qasymm8; + std::unique_ptr<kernels::ClGemmLowpMatrixMultiplyNativeKernel> _mm_native_kernel; + std::unique_ptr<kernels::ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel> _mm_reshaped_only_rhs_kernel; + std::unique_ptr<kernels::ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel> _mm_reshaped_only_rhs_mmul_kernel; + std::unique_ptr<kernels::ClGemmReshapeRhsMatrixKernel> _mtx_b_reshape_kernel; + std::unique_ptr<kernels::ClGemmLowpMatrixAReductionKernel> _mtx_a_reduction_kernel; + std::unique_ptr<kernels::ClGemmLowpMatrixBReductionKernel> _mtx_b_reduction_kernel; + std::unique_ptr<kernels::ClGemmLowpOffsetContributionKernel> _offset_contribution_kernel; + std::unique_ptr<kernels::ClGemmLowpOffsetContributionOutputStageKernel> _offset_contribution_output_stage_kernel; // Temporary tensors TensorInfo _qasymm8_weights{}; @@ -138,15 +140,15 @@ private: TensorInfo _gemm_output_stage_multipliers{}; TensorInfo _gemm_output_stage_shifts{}; - int32_t _a_offset{ 0 }; - int32_t _b_offset{ 0 }; - bool _is_gemm_reshaped{ true }; - bool _reshape_b_only_on_first_run{ false }; - bool _run_output_stage{ false }; - bool _convert_to_qasymm8{ false }; - bool _run_offset_contribution{ false }; - bool _is_prepared{ false }; - GEMMInfo _gemm_info{}; + int32_t _a_offset{ 0 }; + int32_t _b_offset{ 0 }; + bool _reshape_b_only_on_first_run{ false }; + bool _run_output_stage{ false }; + bool _convert_to_qasymm8{ false }; + bool _run_offset_contribution{ false }; + bool _is_prepared{ false }; + GEMMInfo _gemm_info{}; + CLGEMMKernelType _gemm_kernel_type{}; experimental::MemoryRequirements _aux_mem{}; }; |