From 4b90865ab985d571f70c60583cdfb8c7a65f1670 Mon Sep 17 00:00:00 2001 From: Gian Marco Iodice Date: Thu, 18 Oct 2018 10:21:02 +0100 Subject: COMPMID-1413 - Improve the performance of GEMMLowp with 8 bit dot product on OpenCL COMPMID-1424 - Add dot product support for CLDepthwise QASYMM8 3x3 NHWC non-unit stride With this patch we are able to improve the performance of MobileNet v1-qasymm8 by 37 % Tried to use the dot product instruction in CLDepthwise QASYMM8 3x3 NHWC non-unit stride but I have not seen any benefit (maybe because we have few arithemtic operation and we do not have more load instructions). However Depthwise convolution has been improved by 30% Change-Id: Id768a99c2e53a04276707e427af5d0ec93419ada Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/155082 Tested-by: bsgcomp Reviewed-by: Georgios Pinitas --- .../CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp | 107 ++++++++++++++++----- 1 file changed, 83 insertions(+), 24 deletions(-) (limited to 'src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp') diff --git a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp index f79fb43073..f2efb3249b 100644 --- a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp +++ b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp @@ -42,7 +42,7 @@ inline bool is_interleaved_transposed(int m, int n, int k, bool reshape_b_only_o bool flag = true; if(gpu_target_is_in(gpu_target, - GPUTarget::G71, GPUTarget::G72, GPUTarget::G76, + GPUTarget::G71, GPUTarget::G72, GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT, GPUTarget::G52, GPUTarget::G52LIT)) { @@ -56,6 +56,10 @@ inline bool is_interleaved_transposed(int m, int n, int k, bool reshape_b_only_o flag = false; } } + else + { + flag = m > 1; + } return flag; } @@ -69,24 +73,26 @@ CLGEMMLowpMatrixMultiplyCore::CLGEMMLowpMatrixMultiplyCore(std::shared_ptrinfo(), b->info(), output->info(), gemm_info)); + ARM_COMPUTE_ERROR_THROW_ON(CLGEMMLowpMatrixMultiplyCore::validate(a->info(), b->info(), c != nullptr ? c->info() : nullptr, output->info(), gemm_info)); _is_prepared = false; _original_b = b; @@ -108,6 +114,7 @@ void CLGEMMLowpMatrixMultiplyCore::configure(const ICLTensor *a, const ICLTensor // If we pass the matrix A and matrix B reshaped to CLGEMMMatrixMultiplyKernel, we need to pass m, n, k, mult_transpose1xW_width and mult_interleave4x4_height to CLGEMMReshapeInfo // in order to know how the matrices have been reshaped bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); + const bool unroll_block = dot8_supported(CLKernelLibrary::get().get_device()); const int m = reinterpret_input_as_3d ? (a->info()->dimension(1) * a->info()->dimension(2)) : a->info()->dimension(1); const int n = b->info()->dimension(0); const int k = a->info()->dimension(0); @@ -133,15 +140,11 @@ void CLGEMMLowpMatrixMultiplyCore::configure(const ICLTensor *a, const ICLTensor } // Configure interleave kernel - _mtx_a_reshape_kernel.configure(a, &_tmp_a, mult_interleave4x4_height, gemm_info.reinterpret_input_as_3d()); + _mtx_a_reshape_kernel.configure(a, &_tmp_a, mult_interleave4x4_height, gemm_info.reinterpret_input_as_3d(), unroll_block); // Configure transpose kernel _mtx_b_reshape_kernel.configure(b, &_tmp_b, mult_transpose1xW_width); } - // Configure matrix multiply kernel - _mm_kernel.configure(matrix_a, matrix_b, output, _is_interleaved_transposed, GEMMReshapeInfo(m, n, k, - mult_transpose1xW_width, mult_interleave4x4_height, - depth_output_gemm3d, reinterpret_input_as_3d)); // Initialize matrix B reduction kernel only if _a_offset is not equal to 0 if(_a_offset != 0) @@ -168,8 +171,34 @@ void CLGEMMLowpMatrixMultiplyCore::configure(const ICLTensor *a, const ICLTensor _mtx_a_reduction_kernel.configure(a, &_vector_sum_row); } - // Configure offset contribution kernel - _offset_contribution_kernel.configure(output, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, a->info()->dimension(0), _a_offset, _b_offset); + // If GEMMLowpOutputStage != NONE, fuse the offset contribution with the output stage + if(gemm_info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE) + { + _fuse_output_stage = true; + + _memory_group.manage(&_mm_result_s32); + + // Configure matrix multiply kernel + _mm_kernel.configure(matrix_a, matrix_b, &_mm_result_s32, _is_interleaved_transposed, GEMMReshapeInfo(m, n, k, + mult_transpose1xW_width, mult_interleave4x4_height, + depth_output_gemm3d, reinterpret_input_as_3d)); + + // Configure offset contribution kernel + _offset_contribution_output_stage_kernel.configure(&_mm_result_s32, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, c, output, a->info()->dimension(0), + _a_offset, _b_offset, gemm_info.gemmlowp_output_stage()); + + _mm_result_s32.allocator()->allocate(); + } + else + { + // Configure matrix multiply kernel + _mm_kernel.configure(matrix_a, matrix_b, output, _is_interleaved_transposed, GEMMReshapeInfo(m, n, k, + mult_transpose1xW_width, mult_interleave4x4_height, + depth_output_gemm3d, reinterpret_input_as_3d)); + + // Configure offset contribution kernel + _offset_contribution_kernel.configure(output, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, c, a->info()->dimension(0), _a_offset, _b_offset); + } // Allocate tensors if(_is_interleaved_transposed) @@ -192,10 +221,9 @@ void CLGEMMLowpMatrixMultiplyCore::configure(const ICLTensor *a, const ICLTensor } } -Status CLGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *output, const GEMMInfo &gemm_info) +Status CLGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, const GEMMInfo &gemm_info) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, b); ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported"); @@ -241,9 +269,6 @@ Status CLGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITenso ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMTranspose1xWKernel::validate(b, &tmp_b_info, mult_transpose1xW_width)); } - // Validate matrix multiply - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, output, reshape_matrices, reshape_info)); - TensorInfo info_vector_sum_col, info_vector_sum_row; // Validate matrix B reduction kernel only if _a_offset is not equal to 0 @@ -264,11 +289,37 @@ Status CLGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITenso ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixAReductionKernel::validate(a, &info_vector_sum_row)); } - // Validate offset contribution kernel - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOffsetContributionKernel::validate(output, - a_offset == 0 ? nullptr : &info_vector_sum_col, - b_offset == 0 ? nullptr : &info_vector_sum_row, - a_offset, b_offset)); + if(gemm_info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE) + { + TensorInfo mm_result_s32_info{}; + + // Output tensor auto inizialitation if not yet initialized + auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(compute_mm_shape(*matrix_a_info, *matrix_b_info, reshape_matrices, reshape_info)).set_data_type(DataType::S32)); + + // Validate matrix multiply + ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info, reshape_matrices, reshape_info)); + + // Validate offset contribution kernel + ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOffsetContributionOutputStageKernel::validate(&mm_result_s32_info, + a_offset == 0 ? nullptr : &info_vector_sum_col, + b_offset == 0 ? nullptr : &info_vector_sum_row, + c, + output, + a_offset, b_offset, + gemm_info.gemmlowp_output_stage())); + } + else + { + // Validate matrix multiply + ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, output, reshape_matrices, reshape_info)); + + // Validate offset contribution kernel + ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOffsetContributionKernel::validate(output, + a_offset == 0 ? nullptr : &info_vector_sum_col, + b_offset == 0 ? nullptr : &info_vector_sum_row, + c, + a_offset, b_offset)); + } return Status{}; } @@ -306,8 +357,16 @@ void CLGEMMLowpMatrixMultiplyCore::run() CLScheduler::get().enqueue(_mtx_a_reduction_kernel, false); } - // Run offset contribution kernel - CLScheduler::get().enqueue(_offset_contribution_kernel, true); + if(_fuse_output_stage) + { + // Run offset contribution/output stage kernel + CLScheduler::get().enqueue(_offset_contribution_output_stage_kernel, true); + } + else + { + // Run offset contribution kernel + CLScheduler::get().enqueue(_offset_contribution_kernel, true); + } _memory_group.release(); } -- cgit v1.2.1