From 06be6f8d2a316a307fa623150f8adf8f9c3416c5 Mon Sep 17 00:00:00 2001 From: Gian Marco Iodice Date: Mon, 24 Jun 2019 17:47:51 +0100 Subject: COMPMID-2096: Refactor the CLGEMMLowp function selection (heuristic) Change-Id: I15a8b39e0354d3b6686ed4cc8c361782c0512037 Signed-off-by: Gian Marco Iodice Reviewed-on: https://review.mlplatform.org/c/1410 Comments-Addressed: Arm Jenkins Tested-by: Arm Jenkins Reviewed-by: VidhyaSudhan Loganathan --- .../CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp | 82 +++++++++++++++++++--- 1 file changed, 71 insertions(+), 11 deletions(-) (limited to 'src/runtime') diff --git a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp index 875e3a2a00..0286cb3d6d 100644 --- a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp +++ b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp @@ -24,6 +24,7 @@ #include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h" #include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfiguration.h" #include "arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfiguration.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" @@ -48,7 +49,8 @@ inline bool is_gemm_reshaped(bool reshape_b_only_on_first_run, GPUTarget gpu_tar CLGEMMLowpMatrixMultiplyCore::CLGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager) : _memory_group(std::move(memory_manager)), - _mm_kernel(), + _mm_midgard_kernel(), + _mm_native_kernel(), _mm_reshaped_only_rhs_kernel(), _mtx_b_reshape_kernel(), _mtx_a_reduction_kernel(), @@ -63,6 +65,7 @@ CLGEMMLowpMatrixMultiplyCore::CLGEMMLowpMatrixMultiplyCore(std::shared_ptrconfigure(m, n, k, batch_size, DataType::QASYMM8); + + // Configure matrix multiply kernel + _mm_native_kernel.configure(matrix_a, matrix_b, &_mm_result_s32, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, 
reinterpret_input_as_3d)); + } } // Configure offset contribution kernel @@ -178,8 +195,19 @@ void CLGEMMLowpMatrixMultiplyCore::configure(const ICLTensor *a, const ICLTensor } else { - // Configure matrix multiply kernel - _mm_kernel.configure(matrix_a, matrix_b, output, false, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d)); + if(_is_midgard) + { + // Configure matrix multiply kernel + _mm_midgard_kernel.configure(matrix_a, matrix_b, output, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d)); + } + else + { + // Pick up the GEMM configuration + std::tie(lhs_info, rhs_info) = CLGEMMNativeKernelConfigurationFactory::create(gpu_target)->configure(m, n, k, batch_size, DataType::QASYMM8); + + // Configure matrix multiply kernel + _mm_native_kernel.configure(matrix_a, matrix_b, output, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d)); + } } // Configure offset contribution kernel @@ -232,6 +260,7 @@ Status CLGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITenso const unsigned int k = a->dimension(0); const unsigned int batch_size = reinterpret_input_as_3d ? 
a->dimension(3) : a->dimension(2); const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); + const bool is_midgard = gpu_target == GPUTarget::MIDGARD; bool reshape_matrix_b = is_gemm_reshaped(gemm_info.reshape_b_only_on_first_run(), CLScheduler::get().target()); @@ -287,9 +316,21 @@ Status CLGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITenso // Output tensor auto inizialitation if not yet initialized auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(compute_mm_shape(*matrix_a_info, *matrix_b_info, false, reshape_info)).set_data_type(DataType::S32)); - // Validate matrix multiply - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info, false, reshape_info)); + if(is_midgard) + { + // Validate matrix multiply + ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info, reshape_info)); + } + else + { + // Pick up the GEMM configuration + std::tie(lhs_info, rhs_info) = CLGEMMNativeKernelConfigurationFactory::create(gpu_target)->configure(m, n, k, batch_size, DataType::QASYMM8); + + // Validate matrix multiply + ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyNativeKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info, lhs_info, rhs_info, reshape_info)); + } } + // Validate offset contribution kernel ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOffsetContributionOutputStageKernel::validate(&mm_result_s32_info, a_offset == 0 ? 
nullptr : &info_vector_sum_col, @@ -308,9 +349,21 @@ Status CLGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITenso } else { - // Validate matrix multiply - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, output, false, reshape_info)); + if(is_midgard) + { + // Validate matrix multiply + ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, output, reshape_info)); + } + else + { + // Pick up the GEMM configuration + std::tie(lhs_info, rhs_info) = CLGEMMNativeKernelConfigurationFactory::create(gpu_target)->configure(m, n, k, batch_size, DataType::QASYMM8); + + // Validate matrix multiply + ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyNativeKernel::validate(matrix_a_info, matrix_b_info, output, lhs_info, rhs_info, reshape_info)); + } } + if(output->total_size() != 0) { // Validate offset contribution kernel @@ -353,7 +406,14 @@ void CLGEMMLowpMatrixMultiplyCore::run() } else { - CLScheduler::get().enqueue(_mm_kernel, false); + if(_is_midgard) + { + CLScheduler::get().enqueue(_mm_midgard_kernel, false); + } + else + { + CLScheduler::get().enqueue(_mm_native_kernel, false); + } } // Run matrix A reduction kernel only if _b_offset is not equal to 0 -- cgit v1.2.1