From ebc3a90721fe4a41b8e141466894d4d7185c01b7 Mon Sep 17 00:00:00 2001
From: Michele Di Giorgio
Date: Fri, 16 Nov 2018 16:04:25 +0000
Subject: COMPMID-1706: Fuse the bias addition within CLGEMM

Change-Id: I378f2023f4fa010f195f76716ac07aa86279bfae
Signed-off-by: Michele Di Giorgio
Reviewed-on: https://review.mlplatform.org/280
Tested-by: Arm Jenkins
Reviewed-by: Gian Marco Iodice
---
 src/runtime/CL/functions/CLGEMM.cpp           | 20 +++++++++++-----
 .../CL/functions/CLGEMMConvolutionLayer.cpp   | 27 ++++++++++++++--------
 2 files changed, 32 insertions(+), 15 deletions(-)

(limited to 'src/runtime/CL')

diff --git a/src/runtime/CL/functions/CLGEMM.cpp b/src/runtime/CL/functions/CLGEMM.cpp
index cd40fc63c4..e91038f9a2 100644
--- a/src/runtime/CL/functions/CLGEMM.cpp
+++ b/src/runtime/CL/functions/CLGEMM.cpp
@@ -160,6 +160,10 @@ void CLGEMM::configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor *
     const auto workload = static_cast<float>((m * n) / 20.0f);
     _is_new_gemm_reshaped = (workload > 1600.0f) && (get_arch_from_target(gpu_target) == GPUTarget::BIFROST) && _is_interleaved_transposed && (data_type == DataType::F32);
 
+    const bool add_matrix_c  = (beta != 0.f && c != nullptr);
+    const bool is_beta_one   = std::abs(1.0f - beta) < 0.00001f;
+    const bool use_fused_add = is_beta_one && (c != nullptr && c->info()->num_dimensions() == 1) && !_is_new_gemm_reshaped;
+
     // if _is_interleaved_transposed is set, force reinterpret_input_as_3d to be false as the output of CLGEMMInterleaveKernel will be 2D
     if(_is_interleaved_transposed)
     {
@@ -202,9 +206,8 @@ void CLGEMM::configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor *
     if(!_is_new_gemm_reshaped)
     {
         // Configure and tune matrix multiply kernel
-        _mm_kernel.configure(matrix_a, matrix_b, output, alpha, _is_interleaved_transposed, GEMMReshapeInfo(m, n, k,
-                                                                                                            mult_transpose1xW_width, mult_interleave4x4_height,
-                                                                                                            depth_output_gemm3d, reinterpret_input_as_3d),
+        _mm_kernel.configure(matrix_a, matrix_b, (add_matrix_c && !use_fused_add) ? nullptr : c, output, alpha, beta, _is_interleaved_transposed,
+                             GEMMReshapeInfo(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height, depth_output_gemm3d, reinterpret_input_as_3d),
                              gemm_info.fp_mixed_precision());
         CLScheduler::get().tune_kernel_static(_mm_kernel);
     }
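The operand selection in the hunk above is easier to follow outside the diff. The following standalone sketch (illustrative names only, not library code) restates it: the bias c is fused into CLGEMMMatrixMultiplyKernel only when beta is numerically 1, c is a 1D vector and the reshaped-GEMM path is not taken; whenever beta != 0 and c is present but fusion is not possible, the separate CLGEMMMatrixAdditionKernel still runs.

    #include <cmath>
    #include <cstddef>

    struct BiasPlan
    {
        bool fuse_in_mm;   // pass c straight to the matrix multiply kernel
        bool run_addition; // enqueue the separate matrix addition kernel
    };

    // Mirrors the add_matrix_c / is_beta_one / use_fused_add flags above
    BiasPlan plan_bias(float beta, bool c_present, std::size_t c_num_dimensions, bool is_new_gemm_reshaped)
    {
        const bool add_matrix_c  = (beta != 0.f) && c_present;
        const bool is_beta_one   = std::abs(1.0f - beta) < 0.00001f;
        const bool use_fused_add = is_beta_one && c_present && (c_num_dimensions == 1) && !is_new_gemm_reshaped;
        return BiasPlan{ add_matrix_c && use_fused_add, add_matrix_c && !use_fused_add };
    }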
@@ -220,7 +223,7 @@ void CLGEMM::configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor *
     }
 
     // Configure matrix addition kernel
-    if(beta != 0 && c != nullptr)
+    if(add_matrix_c && !use_fused_add)
     {
         _ma_kernel.configure(c, output, beta);
         _run_addition = true;
@@ -284,6 +287,10 @@ Status CLGEMM::validate(const ITensorInfo *a, const ITensorInfo *b, const ITenso
     const auto workload = static_cast<float>((m * n) / 20.0f);
     const bool is_new_gemm_reshaped = (workload > 1600.f) && (get_arch_from_target(gpu_target) == GPUTarget::BIFROST) && run_interleave_transpose && (data_type == DataType::F32);
 
+    const bool add_matrix_c  = (beta != 0.f && c != nullptr);
+    const bool is_beta_one   = std::abs(1.0f - beta) < 0.00001f;
+    const bool use_fused_add = is_beta_one && (c != nullptr && c->num_dimensions() == 1) && !is_new_gemm_reshaped;
+
     // if _is_interleaved_transposed is set, force reinterpret_input_as_3d to be false as the output of CLGEMMInterleaveKernel will be 2D
     if(run_interleave_transpose)
     {
@@ -328,10 +335,11 @@ Status CLGEMM::validate(const ITensorInfo *a, const ITensorInfo *b, const ITenso
     if(!is_new_gemm_reshaped)
     {
         // Validate matrix multiply
-        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, output, alpha, run_interleave_transpose, reshape_info, gpu_target, gemm_info.fp_mixed_precision()));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, (add_matrix_c && !use_fused_add) ? nullptr : c, output, alpha, beta,
+                                                                         run_interleave_transpose, reshape_info, gpu_target, gemm_info.fp_mixed_precision()));
     }
 
-    if(beta != 0 && c != nullptr)
+    if(add_matrix_c && !use_fused_add)
     {
         // Validate matrix addition kernel
         ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixAdditionKernel::validate(c, output, beta));
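Both schedules compute output = alpha * (A * B) + beta * C; the fused path simply folds the row-wise addition of the length-n bias (with beta == 1) into the multiply instead of dispatching a second kernel. A plain C++ reference of the fused arithmetic, assuming row-major m x k and k x n inputs (a sketch of the semantics, not of the OpenCL kernels):

    #include <vector>

    // out[i * n + j] = alpha * sum_p(a[i * k + p] * b[p * n + j]) + bias[j]
    void gemm_fused_bias_ref(const std::vector<float> &a, const std::vector<float> &b,
                             const std::vector<float> &bias, std::vector<float> &out,
                             int m, int n, int k, float alpha)
    {
        for(int i = 0; i < m; ++i)
        {
            for(int j = 0; j < n; ++j)
            {
                float acc = 0.f;
                for(int p = 0; p < k; ++p)
                {
                    acc += a[i * k + p] * b[p * n + j];
                }
                out[i * n + j] = alpha * acc + bias[j]; // beta == 1 on the fused path
            }
        }
    }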
diff --git a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
index 3a8b1a5891..7105e85061 100644
--- a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -93,7 +93,7 @@ void CLConvolutionLayerReshapeWeights::run()
 
 CLGEMMConvolutionLayer::CLGEMMConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(memory_manager), _reshape_weights(), _im2col_kernel(), _mm_gemm(memory_manager), _mm_gemmlowp(memory_manager), _col2im_kernel(), _activationlayer_function(), _add_bias_kernel(),
       _original_weights(nullptr), _im2col_output(), _weights_reshaped(), _gemm_output(), _data_layout(DataLayout::NCHW), _append_bias(false), _skip_im2col(false), _skip_col2im(false), _is_quantized(false),
-      _is_activationlayer_enabled(false), _is_prepared(false)
+      _is_activationlayer_enabled(false), _is_prepared(false), _run_addition(true)
 {
 }
@@ -101,7 +101,8 @@ void CLGEMMConvolutionLayer::configure_mm(const ICLTensor *input, const ICLTenso
                                           int gemm_3d_depth)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights);
-    ARM_COMPUTE_ERROR_THROW_ON(validate_mm(input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), gemmlowp_output_stage, gemm_3d_depth, _skip_im2col));
+    ARM_COMPUTE_ERROR_THROW_ON(validate_mm(input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), gemmlowp_output_stage, gemm_3d_depth, _skip_im2col,
+                                           _run_addition));
 
     const GEMMInfo &gemm_info = GEMMInfo(false, false, true /* Reshape weights only for the first run */,
                                          gemm_3d_depth, _skip_im2col /* Reinterpret the input as 3D if im2col is skipped */,
@@ -125,13 +126,15 @@ void CLGEMMConvolutionLayer::configure_mm(const ICLTensor *input, const ICLTenso
     }
     else
     {
+        // Bias does not need to be added in GEMM if im2col is being used or the Matrix Addition kernel needs to be run
+        const bool skip_bias_in_gemm = _run_addition || !_skip_im2col;
         // Configure matrix multiply function
-        _mm_gemm.configure(input, weights, nullptr, output, 1.0f, 0.0f, gemm_info);
+        _mm_gemm.configure(input, weights, (skip_bias_in_gemm) ? nullptr : biases, output, 1.0f, 1.0f, gemm_info);
     }
 }
 
 Status CLGEMMConvolutionLayer::validate_mm(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output,
-                                           const GEMMLowpOutputStageInfo &gemmlowp_output_stage, int gemm_3d_depth, bool skip_im2col)
+                                           const GEMMLowpOutputStageInfo &gemmlowp_output_stage, int gemm_3d_depth, bool skip_im2col, bool run_addition)
 {
     const bool is_quantized = is_data_type_quantized_asymmetric(input->data_type());
 
@@ -156,8 +159,10 @@ Status CLGEMMConvolutionLayer::validate_mm(const ITensorInfo *input, const ITens
     }
     else
     {
+        // Bias does not need to be added in GEMM if im2col is being used or the Matrix Addition kernel needs to be run
+        const bool skip_bias_in_gemm = run_addition || !skip_im2col;
        // Perform validation step on Matrix multiply function
-        return CLGEMM::validate(input, weights, nullptr, output, 1.0f, 0.0f, gemm_info);
+        return CLGEMM::validate(input, weights, (skip_bias_in_gemm) ? nullptr : biases, output, 1.0f, 1.0f, gemm_info);
     }
 }
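With this change the convolution layer routes the bias down one of three paths, chosen from _skip_im2col, _append_bias and the data type. A compact restatement for the non-quantized case with a bias present (hypothetical helper for illustration; the real decisions are made in configure() and validate_mm() above):

    enum class BiasPath
    {
        AppendedByIm2Col,  // CLIm2ColKernel appends the bias during im2col
        FusedInGemm,       // biases forwarded to CLGEMM with beta = 1.0f
        SeparateAddKernel  // _add_bias_kernel enqueued after the GEMM
    };

    // Assumes a non-quantized graph with biases present (_append_bias == true)
    BiasPath route_bias(bool skip_im2col, bool is_f16)
    {
        if(!skip_im2col)
        {
            return BiasPath::AppendedByIm2Col;
        }
        // _run_addition = _skip_im2col && _append_bias && (data_type != DataType::F16)
        return is_f16 ? BiasPath::FusedInGemm : BiasPath::SeparateAddKernel;
    }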
@@ -193,6 +198,8 @@ void CLGEMMConvolutionLayer::configure(const ICLTensor *input, const ICLTensor *
     _skip_col2im                = data_layout == DataLayout::NHWC;
     _append_bias                = (biases != nullptr) && (!_is_quantized);
     _is_activationlayer_enabled = act_info.enabled();
+    // In case of F16, fused bias will be used in GEMM
+    _run_addition = (_skip_im2col) && (_append_bias) && (data_type != DataType::F16);
 
     // Set the GPU target for im2col and col2im
     _im2col_kernel.set_target(CLScheduler::get().target());
@@ -375,6 +382,8 @@ Status CLGEMMConvolutionLayer::validate(const ITensorInfo *input, const ITensorI
     const bool skip_im2col = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && conv_info.stride().first == 1 && conv_info.stride().second == 1);
     const bool skip_col2im = data_layout == DataLayout::NHWC;
     bool is_activationlayer_enabled = act_info.enabled();
+    // In case of F16, fused bias will be used in GEMM
+    const bool run_addition = (skip_im2col) && (append_bias) && (data_type != DataType::F16);
 
     ARM_COMPUTE_RETURN_ERROR_ON((weights->dimension(idx_channel) * num_groups) != input->dimension(idx_channel));
     ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4);
@@ -429,7 +438,7 @@ Status CLGEMMConvolutionLayer::validate(const ITensorInfo *input, const ITensorI
         ARM_COMPUTE_RETURN_ON_ERROR(CLIm2ColKernel::validate(input, &im2col_reshaped_info, kernel_dims, conv_info, append_bias, dilation, num_groups));
         gemm_input_to_use = &im2col_reshaped_info;
     }
-    else if(append_bias)
+    else if(run_addition)
     {
         // Validate add bias kernel
         ARM_COMPUTE_RETURN_ON_ERROR(CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::ADD, output, biases, output, ConvertPolicy::SATURATE));
@@ -496,7 +505,7 @@ Status CLGEMMConvolutionLayer::validate(const ITensorInfo *input, const ITensorI
     // In case of NHWC, we need to run GEMM3D (gemm_3d_depth != 0) in order to avoid reshaping the output matrix
     const unsigned int gemm_3d_depth = (data_layout == DataLayout::NHWC) ? conv_h : 0;
 
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemm_input_to_use, weights_to_use, biases, gemm_output_to_use, gemmlowp_output_stage, gemm_3d_depth, skip_im2col));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemm_input_to_use, weights_to_use, biases, gemm_output_to_use, gemmlowp_output_stage, gemm_3d_depth, skip_im2col, run_addition));
 
     // Validate Col2Im
     if(!skip_col2im)
@@ -537,7 +546,7 @@ void CLGEMMConvolutionLayer::run()
         _mm_gemm.run();
     }
 
-    if(_skip_im2col && _append_bias)
+    if(_run_addition)
     {
         CLScheduler::get().enqueue(_add_bias_kernel);
     }
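From the caller's perspective the API is unchanged; a 1D bias passed with beta == 1 simply becomes eligible for fusion. A minimal usage sketch (tensor creation, allocation and CL initialisation are abbreviated; eligibility is decided internally as shown in the CLGEMM.cpp hunks above):

    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLGEMM.h"

    using namespace arm_compute;

    void run_gemm_with_bias(CLTensor &a, CLTensor &b, CLTensor &bias, CLTensor &dst)
    {
        // a: M x K, b: K x N, bias: length-N 1D tensor, dst: M x N, all F32 and
        // already allocated; CLScheduler::get().default_init() has been called once.
        CLGEMM gemm;
        gemm.configure(&a, &b, &bias, &dst, 1.0f /* alpha */, 1.0f /* beta */, GEMMInfo());
        gemm.run(); // the bias addition runs inside the multiply kernel when fused
    }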