From eb65f6da695ac0d3e495817145cceb1c4de4f048 Mon Sep 17 00:00:00 2001 From: Gian Marco Iodice Date: Wed, 15 Apr 2020 11:42:15 +0100 Subject: COMPMID-3304: Update OpenCL GEMM heuristic for Int8 Change-Id: I6b7ff678d8d0437a1639db2ff602ea1cdb155464 Signed-off-by: Gian Marco Iodice Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/3056 Tested-by: Arm Jenkins Reviewed-by: Georgios Pinitas Comments-Addressed: Arm Jenkins --- .../CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp | 110 ++++++++------------- .../CL/gemm/CLGEMMKernelSelectionBifrost.cpp | 20 +--- .../CL/gemm/CLGEMMKernelSelectionMidgard.cpp | 5 +- .../CL/gemm/CLGEMMKernelSelectionValhall.cpp | 13 +-- 4 files changed, 52 insertions(+), 96 deletions(-) (limited to 'src/runtime/CL') diff --git a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp index 90e5698fd8..ef17f110d0 100644 --- a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp +++ b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp @@ -35,6 +35,7 @@ #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "arm_compute/runtime/CL/gemm/CLGEMMKernelSelection.h" namespace arm_compute { @@ -43,16 +44,33 @@ using namespace arm_compute::cl_gemm; namespace { -inline bool is_gemm_reshaped(bool reshape_b_only_on_first_run, GPUTarget gpu_target) +inline bool is_gemm_reshaped(unsigned int m, unsigned int n, unsigned int k, DataType data_type, bool reshape_b_only_on_first_run) { - return (get_arch_from_target(gpu_target) != GPUTarget::MIDGARD) && (reshape_b_only_on_first_run); + std::unique_ptr gemm_kernel = CLGEMMKernelSelectionFactory::create(CLScheduler::get().target()); + ARM_COMPUTE_ERROR_ON_NULLPTR(gemm_kernel.get()); + + CLGEMMKernelSelectionParams params; + params.m = m; + params.n = n; + params.k = k; + params.is_rhs_constant = reshape_b_only_on_first_run; + params.data_type = data_type; + + switch(gemm_kernel->select_kernel(params)) + { + case CLGEMMKernelType::NATIVE: + return false; + case CLGEMMKernelType::RESHAPED_ONLY_RHS: + return true; + default: + ARM_COMPUTE_ERROR("Not supported gemmlowp kernel!"); + } } } // namespace CLGEMMLowpMatrixMultiplyCore::CLGEMMLowpMatrixMultiplyCore(std::shared_ptr memory_manager) : _memory_group(std::move(memory_manager)), _weights_to_qasymm8(), - _mm_midgard_kernel(), _mm_native_kernel(), _mm_reshaped_only_rhs_kernel(), _mtx_b_reshape_kernel(), @@ -73,7 +91,6 @@ CLGEMMLowpMatrixMultiplyCore::CLGEMMLowpMatrixMultiplyCore(std::shared_ptrinfo()->data_type(), _reshape_b_only_on_first_run); if(_convert_to_qasymm8) { @@ -220,19 +235,12 @@ void CLGEMMLowpMatrixMultiplyCore::configure(const ICLTensor *a, const ICLTensor } else { - if(_is_midgard) - { - // Configure matrix multiply kernel - _mm_midgard_kernel.configure(_matrix_a, matrix_b, &_mm_result_s32, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d)); - } - else - { - // Pick up the GEMM configuration - std::tie(lhs_info, rhs_info) = CLGEMMNativeKernelConfigurationFactory::create(gpu_target)->configure(m, n, k, batch_size, DataType::QASYMM8); - - // Configure matrix multiply kernel - _mm_native_kernel.configure(_matrix_a, matrix_b, &_mm_result_s32, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d)); - } + // Pick up the GEMM configuration + std::tie(lhs_info, rhs_info) = CLGEMMNativeKernelConfigurationFactory::create(gpu_target)->configure(m, n, k, batch_size, DataType::QASYMM8); + + // Configure matrix multiply kernel + _mm_native_kernel.configure(_matrix_a, matrix_b, &_mm_result_s32, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d)); + _offset_contribution_output_stage_kernel.configure(&_mm_result_s32, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, c, output, a->info()->dimension(0), _a_offset, _b_offset, gemmlowp_output_stage, &_gemm_output_stage_multipliers, &_gemm_output_stage_shifts); @@ -260,19 +268,11 @@ void CLGEMMLowpMatrixMultiplyCore::configure(const ICLTensor *a, const ICLTensor } else { - if(_is_midgard) - { - // Configure matrix multiply kernel - _mm_midgard_kernel.configure(_matrix_a, matrix_b, output, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d)); - } - else - { - // Pick up the GEMM configuration - std::tie(lhs_info, rhs_info) = CLGEMMNativeKernelConfigurationFactory::create(gpu_target)->configure(m, n, k, batch_size, DataType::QASYMM8); + // Pick up the GEMM configuration + std::tie(lhs_info, rhs_info) = CLGEMMNativeKernelConfigurationFactory::create(gpu_target)->configure(m, n, k, batch_size, DataType::QASYMM8); - // Configure matrix multiply kernel - _mm_native_kernel.configure(_matrix_a, matrix_b, output, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d)); - } + // Configure matrix multiply kernel + _mm_native_kernel.configure(_matrix_a, matrix_b, output, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d)); } // Configure offset contribution kernel @@ -329,9 +329,8 @@ Status CLGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITenso const unsigned int k = a->dimension(0); const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2); const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); - const bool is_midgard = gpu_target == GPUTarget::MIDGARD; - bool reshape_matrix_b = is_gemm_reshaped(gemm_info.reshape_b_only_on_first_run(), CLScheduler::get().target()); + bool reshape_matrix_b = is_gemm_reshaped(m, n, k, a->data_type(), gemm_info.reshape_b_only_on_first_run()); const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d); @@ -425,19 +424,11 @@ Status CLGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITenso // Output tensor auto inizialitation if not yet initialized auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(compute_mm_shape(*matrix_a_info, *matrix_b_info, false, reshape_info)).set_data_type(DataType::S32)); - if(is_midgard) - { - // Validate matrix multiply - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info, reshape_info)); - } - else - { - // Pick up the GEMM configuration - std::tie(lhs_info, rhs_info) = CLGEMMNativeKernelConfigurationFactory::create(gpu_target)->configure(m, n, k, batch_size, DataType::QASYMM8); - - // Validate matrix multiply - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyNativeKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info, lhs_info, rhs_info, reshape_info)); - } + // Pick up the GEMM configuration + std::tie(lhs_info, rhs_info) = CLGEMMNativeKernelConfigurationFactory::create(gpu_target)->configure(m, n, k, batch_size, DataType::QASYMM8); + + // Validate matrix multiply + ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyNativeKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info, lhs_info, rhs_info, reshape_info)); } // Validate offset contribution kernel @@ -461,19 +452,11 @@ Status CLGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITenso } else { - if(is_midgard) - { - // Validate matrix multiply - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, output, reshape_info)); - } - else - { - // Pick up the GEMM configuration - std::tie(lhs_info, rhs_info) = CLGEMMNativeKernelConfigurationFactory::create(gpu_target)->configure(m, n, k, batch_size, DataType::QASYMM8); + // Pick up the GEMM configuration + std::tie(lhs_info, rhs_info) = CLGEMMNativeKernelConfigurationFactory::create(gpu_target)->configure(m, n, k, batch_size, DataType::QASYMM8); - // Validate matrix multiply - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyNativeKernel::validate(matrix_a_info, matrix_b_info, output, lhs_info, rhs_info, reshape_info)); - } + // Validate matrix multiply + ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyNativeKernel::validate(matrix_a_info, matrix_b_info, output, lhs_info, rhs_info, reshape_info)); } if(output->total_size() != 0) @@ -524,14 +507,7 @@ void CLGEMMLowpMatrixMultiplyCore::run() } else { - if(_is_midgard) - { - CLScheduler::get().enqueue(_mm_midgard_kernel, false); - } - else - { - CLScheduler::get().enqueue(_mm_native_kernel, false); - } + CLScheduler::get().enqueue(_mm_native_kernel, false); } if(_run_output_stage) { diff --git a/src/runtime/CL/gemm/CLGEMMKernelSelectionBifrost.cpp b/src/runtime/CL/gemm/CLGEMMKernelSelectionBifrost.cpp index d30eaa9edc..041e7d6cb4 100644 --- a/src/runtime/CL/gemm/CLGEMMKernelSelectionBifrost.cpp +++ b/src/runtime/CL/gemm/CLGEMMKernelSelectionBifrost.cpp @@ -165,27 +165,15 @@ CLGEMMKernelType CLGEMMKernelSelectionBifrost::default_f16(unsigned int m, unsig CLGEMMKernelType CLGEMMKernelSelectionBifrost::default_q8(unsigned int m, unsigned int n, unsigned int k, bool is_rhs_constant) { + ARM_COMPUTE_UNUSED(m, n, k); + if(is_rhs_constant) { - if(m == 1) - { - if((n > k) && gpu_target_is_in(_target, GPUTarget::G71)) - { - return CLGEMMKernelType::NATIVE_V1; - } - else - { - return CLGEMMKernelType::RESHAPED_ONLY_RHS; - } - } - else - { - return CLGEMMKernelType::RESHAPED; - } + return CLGEMMKernelType::RESHAPED_ONLY_RHS; } else { - return CLGEMMKernelType::NATIVE_V1; + return CLGEMMKernelType::NATIVE; } } diff --git a/src/runtime/CL/gemm/CLGEMMKernelSelectionMidgard.cpp b/src/runtime/CL/gemm/CLGEMMKernelSelectionMidgard.cpp index b7bb720175..a94a392553 100644 --- a/src/runtime/CL/gemm/CLGEMMKernelSelectionMidgard.cpp +++ b/src/runtime/CL/gemm/CLGEMMKernelSelectionMidgard.cpp @@ -86,10 +86,9 @@ CLGEMMKernelType CLGEMMKernelSelectionMidgard::default_f16(unsigned int m, unsig CLGEMMKernelType CLGEMMKernelSelectionMidgard::default_q8(unsigned int m, unsigned int n, unsigned int k, bool is_rhs_constant) { - ARM_COMPUTE_UNUSED(n, k); + ARM_COMPUTE_UNUSED(m, n, k, is_rhs_constant); - // We reshape the matrices only if we do not have the vector-by-matrix case and we reshape the matrix B only once - return ((m != 1) && is_rhs_constant) ? CLGEMMKernelType::RESHAPED_V1 : CLGEMMKernelType::NATIVE_V1; + return CLGEMMKernelType::NATIVE; } } // namespace cl_gemm } // namespace arm_compute diff --git a/src/runtime/CL/gemm/CLGEMMKernelSelectionValhall.cpp b/src/runtime/CL/gemm/CLGEMMKernelSelectionValhall.cpp index 8016417eb9..775bb9bffd 100644 --- a/src/runtime/CL/gemm/CLGEMMKernelSelectionValhall.cpp +++ b/src/runtime/CL/gemm/CLGEMMKernelSelectionValhall.cpp @@ -83,22 +83,15 @@ CLGEMMKernelType CLGEMMKernelSelectionValhall::default_f16(unsigned int m, unsig CLGEMMKernelType CLGEMMKernelSelectionValhall::default_q8(unsigned int m, unsigned int n, unsigned int k, bool is_rhs_constant) { - ARM_COMPUTE_UNUSED(n, k); + ARM_COMPUTE_UNUSED(m, n, k); if(is_rhs_constant) { - if(m == 1) - { - return CLGEMMKernelType::RESHAPED_ONLY_RHS; - } - else - { - return CLGEMMKernelType::RESHAPED; - } + return CLGEMMKernelType::RESHAPED_ONLY_RHS; } else { - return CLGEMMKernelType::NATIVE_V1; + return CLGEMMKernelType::NATIVE; } } } // namespace cl_gemm -- cgit v1.2.1