diff options
author | Michele Di Giorgio <michele.digiorgio@arm.com> | 2018-11-16 16:04:25 +0000 |
---|---|---|
committer | Michele Di Giorgio <michele.digiorgio@arm.com> | 2019-02-07 09:44:08 +0000 |
commit | ebc3a90721fe4a41b8e141466894d4d7185c01b7 (patch) | |
tree | 9149764caa37edbdc6bb6c69d503d37dbb28449f /src/runtime/CL/functions/CLGEMM.cpp | |
parent | 4632e5e44e9a78b15884d0947007bb030fde0aea (diff) | |
download | ComputeLibrary-ebc3a90721fe4a41b8e141466894d4d7185c01b7.tar.gz |
COMPMID-1706: Fuse the bias addition within CLGEMM
Change-Id: I378f2023f4fa010f195f76716ac07aa86279bfae
Signed-off-by: Michele Di Giorgio <michele.digiorgio@arm.com>
Reviewed-on: https://review.mlplatform.org/280
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Diffstat (limited to 'src/runtime/CL/functions/CLGEMM.cpp')
-rw-r--r-- | src/runtime/CL/functions/CLGEMM.cpp | 20 |
1 file changed, 14 insertions, 6 deletions
diff --git a/src/runtime/CL/functions/CLGEMM.cpp b/src/runtime/CL/functions/CLGEMM.cpp index cd40fc63c4..e91038f9a2 100644 --- a/src/runtime/CL/functions/CLGEMM.cpp +++ b/src/runtime/CL/functions/CLGEMM.cpp @@ -160,6 +160,10 @@ void CLGEMM::configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor * const auto workload = static_cast<float>((m * n) / 20.0f); _is_new_gemm_reshaped = (workload > 1600.0f) && (get_arch_from_target(gpu_target) == GPUTarget::BIFROST) && _is_interleaved_transposed && (data_type == DataType::F32); + const bool add_matrix_c = (beta != 0.f && c != nullptr); + const bool is_beta_one = std::abs(1.0f - beta) < 0.00001f; + const bool use_fused_add = is_beta_one && (c != nullptr && c->info()->num_dimensions() == 1) && !_is_new_gemm_reshaped; + // if _is_interleaved_transposed is set, force reinterpret_input_as_3d to be false as the output of CLGEMMInterleaveKernel will be 2D if(_is_interleaved_transposed) { @@ -202,9 +206,8 @@ void CLGEMM::configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor * if(!_is_new_gemm_reshaped) { // Configure and tune matrix multiply kernel - _mm_kernel.configure(matrix_a, matrix_b, output, alpha, _is_interleaved_transposed, GEMMReshapeInfo(m, n, k, - mult_transpose1xW_width, mult_interleave4x4_height, - depth_output_gemm3d, reinterpret_input_as_3d), + _mm_kernel.configure(matrix_a, matrix_b, (add_matrix_c && !use_fused_add) ? 
nullptr : c, output, alpha, beta, _is_interleaved_transposed, + GEMMReshapeInfo(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height, depth_output_gemm3d, reinterpret_input_as_3d), gemm_info.fp_mixed_precision()); CLScheduler::get().tune_kernel_static(_mm_kernel); } @@ -220,7 +223,7 @@ void CLGEMM::configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor * } // Configure matrix addition kernel - if(beta != 0 && c != nullptr) + if(add_matrix_c && !use_fused_add) { _ma_kernel.configure(c, output, beta); _run_addition = true; @@ -284,6 +287,10 @@ Status CLGEMM::validate(const ITensorInfo *a, const ITensorInfo *b, const ITenso const auto workload = static_cast<float>((m * n) / 20.0f); const bool is_new_gemm_reshaped = (workload > 1600.f) && (get_arch_from_target(gpu_target) == GPUTarget::BIFROST) && run_interleave_transpose && (data_type == DataType::F32); + const bool add_matrix_c = (beta != 0.f && c != nullptr); + const bool is_beta_one = std::abs(1.0f - beta) < 0.00001f; + const bool use_fused_add = is_beta_one && (c != nullptr && c->num_dimensions() == 1) && !is_new_gemm_reshaped; + // if _is_interleaved_transposed is set, force reinterpret_input_as_3d to be false as the output of CLGEMMInterleaveKernel will be 2D if(run_interleave_transpose) { @@ -328,10 +335,11 @@ Status CLGEMM::validate(const ITensorInfo *a, const ITensorInfo *b, const ITenso if(!is_new_gemm_reshaped) { // Validate matrix multiply - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, output, alpha, run_interleave_transpose, reshape_info, gpu_target, gemm_info.fp_mixed_precision())); + ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, (add_matrix_c && !use_fused_add) ? 
nullptr : c, output, alpha, beta, + run_interleave_transpose, reshape_info, gpu_target, gemm_info.fp_mixed_precision())); } - if(beta != 0 && c != nullptr) + if(add_matrix_c && !use_fused_add) { // Validate matrix addition kernel ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixAdditionKernel::validate(c, output, beta)); |