From ebc3a90721fe4a41b8e141466894d4d7185c01b7 Mon Sep 17 00:00:00 2001 From: Michele Di Giorgio Date: Fri, 16 Nov 2018 16:04:25 +0000 Subject: COMPMID-1706: Fuse the bias addition within CLGEMM Change-Id: I378f2023f4fa010f195f76716ac07aa86279bfae Signed-off-by: Michele Di Giorgio Reviewed-on: https://review.mlplatform.org/280 Tested-by: Arm Jenkins Reviewed-by: Gian Marco Iodice --- src/runtime/CL/functions/CLGEMM.cpp | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) (limited to 'src/runtime/CL/functions/CLGEMM.cpp') diff --git a/src/runtime/CL/functions/CLGEMM.cpp b/src/runtime/CL/functions/CLGEMM.cpp index cd40fc63c4..e91038f9a2 100644 --- a/src/runtime/CL/functions/CLGEMM.cpp +++ b/src/runtime/CL/functions/CLGEMM.cpp @@ -160,6 +160,10 @@ void CLGEMM::configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor * const auto workload = static_cast((m * n) / 20.0f); _is_new_gemm_reshaped = (workload > 1600.0f) && (get_arch_from_target(gpu_target) == GPUTarget::BIFROST) && _is_interleaved_transposed && (data_type == DataType::F32); + const bool add_matrix_c = (beta != 0.f && c != nullptr); + const bool is_beta_one = std::abs(1.0f - beta) < 0.00001f; + const bool use_fused_add = is_beta_one && (c != nullptr && c->info()->num_dimensions() == 1) && !_is_new_gemm_reshaped; + // if _is_interleaved_transposed is set, force reinterpret_input_as_3d to be false as the output of CLGEMMInterleaveKernel will be 2D if(_is_interleaved_transposed) { @@ -202,9 +206,8 @@ void CLGEMM::configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor * if(!_is_new_gemm_reshaped) { // Configure and tune matrix multiply kernel - _mm_kernel.configure(matrix_a, matrix_b, output, alpha, _is_interleaved_transposed, GEMMReshapeInfo(m, n, k, - mult_transpose1xW_width, mult_interleave4x4_height, - depth_output_gemm3d, reinterpret_input_as_3d), + _mm_kernel.configure(matrix_a, matrix_b, (add_matrix_c && !use_fused_add) ? nullptr : c, output, alpha, beta, _is_interleaved_transposed, + GEMMReshapeInfo(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height, depth_output_gemm3d, reinterpret_input_as_3d), gemm_info.fp_mixed_precision()); CLScheduler::get().tune_kernel_static(_mm_kernel); } @@ -220,7 +223,7 @@ void CLGEMM::configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor * } // Configure matrix addition kernel - if(beta != 0 && c != nullptr) + if(add_matrix_c && !use_fused_add) { _ma_kernel.configure(c, output, beta); _run_addition = true; @@ -284,6 +287,10 @@ Status CLGEMM::validate(const ITensorInfo *a, const ITensorInfo *b, const ITenso const auto workload = static_cast((m * n) / 20.0f); const bool is_new_gemm_reshaped = (workload > 1600.f) && (get_arch_from_target(gpu_target) == GPUTarget::BIFROST) && run_interleave_transpose && (data_type == DataType::F32); + const bool add_matrix_c = (beta != 0.f && c != nullptr); + const bool is_beta_one = std::abs(1.0f - beta) < 0.00001f; + const bool use_fused_add = is_beta_one && (c != nullptr && c->num_dimensions() == 1) && !is_new_gemm_reshaped; + // if _is_interleaved_transposed is set, force reinterpret_input_as_3d to be false as the output of CLGEMMInterleaveKernel will be 2D if(run_interleave_transpose) { @@ -328,10 +335,11 @@ Status CLGEMM::validate(const ITensorInfo *a, const ITensorInfo *b, const ITenso if(!is_new_gemm_reshaped) { // Validate matrix multiply - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, output, alpha, run_interleave_transpose, reshape_info, gpu_target, gemm_info.fp_mixed_precision())); + ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, (add_matrix_c && !use_fused_add) ? nullptr : c, output, alpha, beta, + run_interleave_transpose, reshape_info, gpu_target, gemm_info.fp_mixed_precision())); } - if(beta != 0 && c != nullptr) + if(add_matrix_c && !use_fused_add) { // Validate matrix addition kernel ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixAdditionKernel::validate(c, output, beta)); -- cgit v1.2.1