From f3622becf1f0d6bf5147ebb7d6d0f14d5252860a Mon Sep 17 00:00:00 2001 From: Gian Marco Iodice Date: Mon, 29 Jul 2019 14:27:16 +0100 Subject: COMPMID-1979: Fuse Activation Function in CLGEMM - part 4 Fused activation function in CLGEMM Change-Id: I644fdf09349325c0b3a2cd5fef2a3ea2c974149d Signed-off-by: Gian Marco Iodice Reviewed-on: https://review.mlplatform.org/c/1640 Comments-Addressed: Arm Jenkins Reviewed-by: Georgios Pinitas Tested-by: Arm Jenkins --- src/runtime/CL/functions/CLGEMM.cpp | 91 +++------- .../CL/functions/CLGEMMConvolutionLayer.cpp | 193 ++++++++++++--------- 2 files changed, 137 insertions(+), 147 deletions(-) (limited to 'src') diff --git a/src/runtime/CL/functions/CLGEMM.cpp b/src/runtime/CL/functions/CLGEMM.cpp index c0ccd0f451..e78395f1de 100644 --- a/src/runtime/CL/functions/CLGEMM.cpp +++ b/src/runtime/CL/functions/CLGEMM.cpp @@ -48,7 +48,6 @@ using namespace arm_compute::cl_gemm; CLGEMM::CLGEMM(std::shared_ptr memory_manager) : _memory_group(std::move(memory_manager)), _mm_kernel(), - _ma_kernel(), _reshape_lhs_kernel(), _reshape_rhs_kernel(), _mm_reshaped_kernel(), @@ -56,7 +55,6 @@ CLGEMM::CLGEMM(std::shared_ptr memory_manager) _tmp_a(), _tmp_b(), _original_b(nullptr), - _run_addition(false), _reshape_b_only_on_first_run(false), _is_prepared(false), _gemm_type(GEMMType::NATIVE) @@ -118,10 +116,10 @@ void CLGEMM::configure_native(const ICLTensor *a, const ICLTensor *b, const ICLT // Set the target for the kernels _mm_kernel.set_target(gpu_target); - GEMMReshapeInfo reshape_info(m, n, k, 1, 1, gemm_info.depth_output_gemm3d(), gemm_info.reinterpret_input_as_3d()); + GEMMReshapeInfo reshape_info(m, n, k, 1, 1, gemm_info.depth_output_gemm3d(), gemm_info.reinterpret_input_as_3d(), gemm_info.broadcast_bias()); // Configure and tune matrix multiply kernel - _mm_kernel.configure(a, b, c, output, alpha, beta, false, reshape_info, gemm_info.fp_mixed_precision()); + _mm_kernel.configure(a, b, c, output, alpha, beta, false, reshape_info, gemm_info.fp_mixed_precision(), gemm_info.activation_info()); // Tune kernel statically CLScheduler::get().tune_kernel_static(_mm_kernel); @@ -162,7 +160,7 @@ void CLGEMM::configure_reshaped_v1(const ICLTensor *a, const ICLTensor *b, const lhs_info.interleave = true; lhs_info.transpose = true; - GEMMReshapeInfo reshape_info(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height, depth_output_gemm3d, false); + GEMMReshapeInfo reshape_info(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height, depth_output_gemm3d, false, gemm_info.broadcast_bias()); _memory_group.manage(&_tmp_a); if(!_reshape_b_only_on_first_run) @@ -177,7 +175,7 @@ void CLGEMM::configure_reshaped_v1(const ICLTensor *a, const ICLTensor *b, const _reshape_rhs_kernel.configure(b, &_tmp_b, rhs_info); // Configure and tune matrix multiply kernel - _mm_kernel.configure(&_tmp_a, &_tmp_b, c, output, alpha, beta, true, reshape_info, gemm_info.fp_mixed_precision()); + _mm_kernel.configure(&_tmp_a, &_tmp_b, c, output, alpha, beta, true, reshape_info, gemm_info.fp_mixed_precision(), gemm_info.activation_info()); CLScheduler::get().tune_kernel_static(_mm_kernel); @@ -200,13 +198,15 @@ void CLGEMM::configure_reshaped_v2(const ICLTensor *a, const ICLTensor *b, const const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); const GPUTarget gpu_target = CLScheduler::get().target(); bool broadcast_bias = gemm_info.broadcast_bias(); - GEMMKernelInfo kernel_info; + + GEMMKernelInfo kernel_info; kernel_info.m = m; kernel_info.n = n; kernel_info.k = k; kernel_info.depth_output_gemm3d = depth_output_gemm3d; kernel_info.reinterpret_input_as_3d = false; kernel_info.broadcast_bias = broadcast_bias; + kernel_info.activation_info = gemm_info.activation_info(); // Set the target for the kernels _reshape_lhs_kernel.set_target(gpu_target); @@ -255,13 +255,15 @@ void CLGEMM::configure_reshaped_only_rhs(const ICLTensor *a, const ICLTensor *b, const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); const GPUTarget gpu_target = CLScheduler::get().target(); bool broadcast_bias = gemm_info.broadcast_bias(); - GEMMKernelInfo kernel_info; + + GEMMKernelInfo kernel_info; kernel_info.m = m; kernel_info.n = n; kernel_info.k = k; kernel_info.depth_output_gemm3d = depth_output_gemm3d; kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d; kernel_info.broadcast_bias = broadcast_bias; + kernel_info.activation_info = gemm_info.activation_info(); // Set the target for the kernels _mm_kernel.set_target(gpu_target); @@ -305,21 +307,12 @@ Status CLGEMM::validate_native(const ITensorInfo *a, const ITensorInfo *b, const const unsigned int n = b->dimension(0); const unsigned int k = a->dimension(0); const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); - const bool add_c = (beta != 0.f && c != nullptr); - const bool is_beta_one = std::abs(1.0f - beta) < 0.00001f; - const bool fuse_add = is_beta_one && (c != nullptr && c->num_dimensions() == 1); - const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d); + const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d, gemm_info.broadcast_bias()); // Validate matrix multiply - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyKernel::validate(a, b, (add_c && fuse_add) ? c : nullptr, output, alpha, beta, - false, reshape_info, gpu_target, gemm_info.fp_mixed_precision())); - - if(add_c && !fuse_add) - { - // Validate matrix addition kernel - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixAdditionKernel::validate(c, output, beta)); - } + ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyKernel::validate(a, b, c, output, alpha, beta, + false, reshape_info, gpu_target, gemm_info.fp_mixed_precision(), gemm_info.activation_info())); return Status{}; } @@ -340,9 +333,6 @@ Status CLGEMM::validate_reshaped_v1(const ITensorInfo *a, const ITensorInfo *b, int mult_transpose1xW_width = 1; int mult_interleave4x4_height = 1; const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); - const bool add_c = (beta != 0.f && c != nullptr); - const bool is_beta_one = std::abs(1.0f - beta) < 0.00001f; - const bool fuse_add = is_beta_one && (c != nullptr && c->num_dimensions() == 1); if(get_arch_from_target(gpu_target) == GPUTarget::BIFROST) { @@ -364,7 +354,7 @@ Status CLGEMM::validate_reshaped_v1(const ITensorInfo *a, const ITensorInfo *b, lhs_info.interleave = true; lhs_info.transpose = true; - const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height, depth_output_gemm3d, false); + const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height, depth_output_gemm3d, false, gemm_info.broadcast_bias()); // Validate interleave kernel auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_lhs_reshaped_shape(*a, lhs_info, gemm_info.reinterpret_input_as_3d()))); @@ -375,14 +365,8 @@ Status CLGEMM::validate_reshaped_v1(const ITensorInfo *a, const ITensorInfo *b, ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeRHSMatrixKernel::validate(b, &tmp_b_info, rhs_info)); // Validate matrix multiply - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyKernel::validate(&tmp_a_info, &tmp_b_info, (add_c && fuse_add) ? c : nullptr, output, alpha, beta, - true, reshape_info, gpu_target, gemm_info.fp_mixed_precision())); - - if(add_c && !fuse_add) - { - // Validate matrix addition kernel - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixAdditionKernel::validate(c, output, beta)); - } + ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyKernel::validate(&tmp_a_info, &tmp_b_info, c, output, alpha, beta, + true, reshape_info, gpu_target, gemm_info.fp_mixed_precision(), gemm_info.activation_info())); return Status{}; } @@ -405,13 +389,15 @@ Status CLGEMM::validate_reshaped_v2(const ITensorInfo *a, const ITensorInfo *b, const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2); const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); const bool broadcast_bias = gemm_info.broadcast_bias(); - GEMMKernelInfo kernel_info; + + GEMMKernelInfo kernel_info; kernel_info.m = m; kernel_info.n = n; kernel_info.k = k; kernel_info.depth_output_gemm3d = depth_output_gemm3d; kernel_info.reinterpret_input_as_3d = false; kernel_info.broadcast_bias = broadcast_bias; + kernel_info.activation_info = gemm_info.activation_info(); GEMMLHSMatrixInfo lhs_info; GEMMRHSMatrixInfo rhs_info; @@ -452,13 +438,15 @@ Status CLGEMM::validate_reshaped_only_rhs(const ITensorInfo *a, const ITensorInf const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2); const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); const bool broadcast_bias = gemm_info.broadcast_bias(); - GEMMKernelInfo kernel_info; + + GEMMKernelInfo kernel_info; kernel_info.m = m; kernel_info.n = n; kernel_info.k = k; kernel_info.depth_output_gemm3d = depth_output_gemm3d; kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d; kernel_info.broadcast_bias = broadcast_bias; + kernel_info.activation_info = gemm_info.activation_info(); GEMMLHSMatrixInfo lhs_info; GEMMRHSMatrixInfo rhs_info; @@ -501,9 +489,7 @@ void CLGEMM::configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor * // Select GEMMType _gemm_type = select_gemm_type(m, n, k, a->info()->data_type(), _reshape_b_only_on_first_run, gpu_target); - const bool is_fuse_add_c_supported = (_gemm_type == GEMMType::RESHAPED_V2) || (_gemm_type == GEMMType::RESHAPED_ONLY_RHS); - const bool add_c = (!(helpers::float_ops::is_zero(beta)) && c != nullptr); - const bool fuse_add_c = add_c && is_fuse_add_c_supported; + const bool fuse_add_c = (!(helpers::float_ops::is_zero(beta)) && c != nullptr); const ICLTensor *c_to_use = fuse_add_c ? c : nullptr; @@ -534,13 +520,6 @@ void CLGEMM::configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor * ARM_COMPUTE_ERROR("GEMMType not supported"); } } - - // Configure matrix addition kernel - if(add_c && !fuse_add_c) - { - _ma_kernel.configure(c, output, beta); - _run_addition = true; - } } Status CLGEMM::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info) @@ -555,9 +534,7 @@ Status CLGEMM::validate(const ITensorInfo *a, const ITensorInfo *b, const ITenso // Select GEMMType GEMMType gemm_type = select_gemm_type(m, n, k, a->data_type(), gemm_info.reshape_b_only_on_first_run(), gpu_target); - const bool is_fuse_add_c_supported = (gemm_type == GEMMType::RESHAPED_V2) || (gemm_type == GEMMType::RESHAPED_ONLY_RHS); - const bool add_c = (!(helpers::float_ops::is_zero(beta)) && c != nullptr); - const bool fuse_add_c = add_c && is_fuse_add_c_supported; + const bool fuse_add_c = (!(helpers::float_ops::is_zero(beta)) && c != nullptr); const ITensorInfo *c_to_use = fuse_add_c ? c : nullptr; @@ -589,12 +566,6 @@ Status CLGEMM::validate(const ITensorInfo *a, const ITensorInfo *b, const ITenso } } - // Validate matrix addition kernel - if(add_c && !fuse_add_c) - { - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixAdditionKernel::validate(c, output, beta)); - } - return Status{}; } @@ -609,7 +580,7 @@ void CLGEMM::run() { case GEMMType::NATIVE: { - CLScheduler::get().enqueue(_mm_kernel, !_run_addition); + CLScheduler::get().enqueue(_mm_kernel, true); break; } case GEMMType::RESHAPED_V1: @@ -623,7 +594,7 @@ void CLGEMM::run() CLScheduler::get().enqueue(_reshape_rhs_kernel, false); } - CLScheduler::get().enqueue(_mm_kernel, !_run_addition); + CLScheduler::get().enqueue(_mm_kernel, true); break; } case GEMMType::RESHAPED_V2: @@ -637,7 +608,7 @@ void CLGEMM::run() CLScheduler::get().enqueue(_reshape_rhs_kernel, false); } - CLScheduler::get().enqueue(_mm_reshaped_kernel, !_run_addition); + CLScheduler::get().enqueue(_mm_reshaped_kernel, true); break; } case GEMMType::RESHAPED_ONLY_RHS: @@ -648,7 +619,7 @@ void CLGEMM::run() CLScheduler::get().enqueue(_reshape_rhs_kernel, false); } - CLScheduler::get().enqueue(_mm_reshaped_only_rhs_kernel, !_run_addition); + CLScheduler::get().enqueue(_mm_reshaped_only_rhs_kernel, true); break; } default: @@ -656,12 +627,6 @@ void CLGEMM::run() ARM_COMPUTE_ERROR("GEMMType not supported"); } } - - // Run matrix addition kernel - if(_run_addition) - { - CLScheduler::get().enqueue(_ma_kernel); - } } void CLGEMM::prepare() diff --git a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp index 99f045a0bf..be6be04703 100644 --- a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp +++ b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp @@ -91,22 +91,27 @@ void CLConvolutionLayerReshapeWeights::run() } CLGEMMConvolutionLayer::CLGEMMConvolutionLayer(std::shared_ptr memory_manager) - : _memory_group(memory_manager), _reshape_weights(), _im2col_kernel(), _mm_gemm(memory_manager), _mm_gemmlowp(memory_manager), _col2im_kernel(), _activationlayer_function(), _add_bias_kernel(), - _original_weights(nullptr), _im2col_output(), _weights_reshaped(), _gemm_output(), _data_layout(DataLayout::NCHW), _append_bias(false), _skip_im2col(false), _skip_col2im(false), _is_quantized(false), - _is_activationlayer_enabled(false), _is_prepared(false), _run_addition(true) + : _memory_group(memory_manager), _reshape_weights(), _im2col_kernel(), _mm_gemm(memory_manager), _mm_gemmlowp(memory_manager), _col2im_kernel(), _activationlayer_function(), + _original_weights(nullptr), _im2col_output(), _weights_reshaped(), _gemm_output(), _skip_im2col(false), _skip_col2im(false), _is_quantized(false), _fuse_activation(true), _is_prepared(false) { } void CLGEMMConvolutionLayer::configure_mm(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const GEMMLowpOutputStageInfo &gemmlowp_output_stage, - int gemm_3d_depth) + int gemm_3d_depth, const ActivationLayerInfo &act_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights); - ARM_COMPUTE_ERROR_THROW_ON(validate_mm(input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), gemmlowp_output_stage, gemm_3d_depth, _skip_im2col, - _run_addition)); - - const GEMMInfo &gemm_info = GEMMInfo(false, false, true /* Reshape weights only for the first run */, - gemm_3d_depth, _skip_im2col /* Reinterpret the input as 3D if im2col is skipped */, - false, gemmlowp_output_stage); + ARM_COMPUTE_ERROR_THROW_ON(validate_mm(input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), gemmlowp_output_stage, gemm_3d_depth, _skip_im2col, act_info)); + + const GEMMInfo &gemm_info = GEMMInfo(false, // is_a_reshaped + false, // is_b_reshaped + true, // reshape_b_only_on_first_run + gemm_3d_depth, // depth_output_gemm3d + _skip_im2col, // reinterpret_input_as_3d + false, // retain_internal_weights + gemmlowp_output_stage, // gemmlowp_output_stage + false, // fp_mixed_precision + true, // broadcast_bias + act_info); // activation_info if(_is_quantized) { @@ -126,21 +131,26 @@ void CLGEMMConvolutionLayer::configure_mm(const ICLTensor *input, const ICLTenso } else { - // Bias does not need to be added in GEMM if im2col is being used or the Matrix Addition kernel needs to be run - const bool skip_bias_in_gemm = _run_addition || !_skip_im2col; // Configure matrix multiply function - _mm_gemm.configure(input, weights, (skip_bias_in_gemm) ? nullptr : biases, output, 1.0f, 1.0f, gemm_info); + _mm_gemm.configure(input, weights, biases, output, 1.0f, 1.0f, gemm_info); } } Status CLGEMMConvolutionLayer::validate_mm(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, - const GEMMLowpOutputStageInfo &gemmlowp_output_stage, int gemm_3d_depth, bool skip_im2col, bool run_addition) + const GEMMLowpOutputStageInfo &gemmlowp_output_stage, int gemm_3d_depth, bool skip_im2col, const ActivationLayerInfo &act_info) { const bool is_quantized = is_data_type_quantized_asymmetric(input->data_type()); - const GEMMInfo &gemm_info = GEMMInfo(false, false, true /* Reshape weights only for the first run */, - gemm_3d_depth, skip_im2col /* Reinterpret the input as 3D if im2col is skipped */, - false, gemmlowp_output_stage); + const GEMMInfo &gemm_info = GEMMInfo(false, // is_a_reshaped + false, // is_b_reshaped + true, // reshape_b_only_on_first_run + gemm_3d_depth, // depth_output_gemm3d + skip_im2col, // reinterpret_input_as_3d + false, // retain_internal_weights + gemmlowp_output_stage, // gemmlowp_output_stage + false, // fp_mixed_precision + true, // broadcast_bias + act_info); // activation_info if(is_quantized) { @@ -159,10 +169,8 @@ Status CLGEMMConvolutionLayer::validate_mm(const ITensorInfo *input, const ITens } else { - // Bias does not need to be added in GEMM if im2col is being used or the Matrix Addition kernel needs to be run - const bool skip_bias_in_gemm = run_addition || !skip_im2col; // Perform validation step on Matrix multiply function - return CLGEMM::validate(input, weights, (skip_bias_in_gemm) ? nullptr : biases, output, 1.0f, 1.0f, gemm_info); + return CLGEMM::validate(input, weights, biases, output, 1.0f, 1.0f, gemm_info); } } @@ -194,15 +202,14 @@ void CLGEMMConvolutionLayer::configure(const ICLTensor *input, const ICLTensor * const UniformQuantizationInfo wq_info = weights->info()->quantization_info().uniform(); const UniformQuantizationInfo oq_info = output->info()->quantization_info().uniform(); - _is_prepared = weights_info.retain_internal_weights(); - _original_weights = weights; - _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type()); - _data_layout = data_layout; - _skip_im2col = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && conv_info.stride().first == 1 && conv_info.stride().second == 1); - _skip_col2im = data_layout == DataLayout::NHWC; - _append_bias = (biases != nullptr) && (!_is_quantized); - _is_activationlayer_enabled = act_info.enabled(); - _run_addition = (_skip_im2col) && (_append_bias); + _is_prepared = weights_info.retain_internal_weights(); + _original_weights = weights; + _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type()); + _skip_im2col = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && conv_info.stride().first == 1 && conv_info.stride().second == 1); + _skip_col2im = data_layout == DataLayout::NHWC; + + // Only for quantize there are few cases where we cannot fuse the activation function in GEMM + _fuse_activation = true; // Set the GPU target for im2col and col2im _im2col_kernel.set_target(CLScheduler::get().target()); @@ -211,8 +218,6 @@ void CLGEMMConvolutionLayer::configure(const ICLTensor *input, const ICLTensor * const ICLTensor *gemm_input_to_use = input; ICLTensor *gemm_output_to_use = output; - const ICLTensor *biases_to_use = (_append_bias && !_skip_im2col) ? biases : nullptr; - // Get parameters from conv_info unsigned int stride_x = 0; unsigned int stride_y = 0; @@ -230,9 +235,22 @@ void CLGEMMConvolutionLayer::configure(const ICLTensor *input, const ICLTensor * unsigned int mat_weights_cols = weights->info()->dimension(idx_kernels) / num_groups; - // _weights_reshaped will be auto configured in the kernel. - // Just append biases and do not transpose 1xW as it will be reshaped in CLGEMM - _reshape_weights.configure(weights, biases_to_use, &_weights_reshaped, num_groups); + const ICLTensor *biases_to_use = biases; + bool append_bias = false; + + if(num_groups != 1 && biases != nullptr) + { + // num_groups != 1 can only be for NCHW + // Since it is missing an utility function to reshape the biases, we append the biases into the weights tensor + biases_to_use = nullptr; + append_bias = true; + + _reshape_weights.configure(weights, biases, &_weights_reshaped, num_groups); + } + else + { + _reshape_weights.configure(weights, nullptr, &_weights_reshaped, num_groups); + } // Create tensor to store im2col reshaped inputs if(!_skip_im2col) @@ -240,7 +258,7 @@ void CLGEMMConvolutionLayer::configure(const ICLTensor *input, const ICLTensor * _memory_group.manage(&_im2col_output); // Configure and tune im2col. im2col output shape is auto-initialized - _im2col_kernel.configure(input, &_im2col_output, Size2D(kernel_width, kernel_height), conv_info, _append_bias, dilation, num_groups); + _im2col_kernel.configure(input, &_im2col_output, Size2D(kernel_width, kernel_height), conv_info, append_bias, dilation, num_groups); // Set quantization info _im2col_output.info()->set_quantization_info(input->info()->quantization_info()); @@ -249,11 +267,6 @@ void CLGEMMConvolutionLayer::configure(const ICLTensor *input, const ICLTensor * // Update GEMM input gemm_input_to_use = &_im2col_output; } - else if(_append_bias) - { - // Configure add bias kernel - _add_bias_kernel.configure(ArithmeticOperation::ADD, output, biases, output, ConvertPolicy::SATURATE); - } // Create GEMM output tensor if(!_skip_col2im) @@ -299,16 +312,20 @@ void CLGEMMConvolutionLayer::configure(const ICLTensor *input, const ICLTensor * ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU }; - if(_is_activationlayer_enabled && supported_acts.count(act_info.activation()) != 0) + if(act_info.enabled()) { - const int a_const_int = quantize_qasymm8(act_info.a(), output_quant_info); - const int b_const_int = quantize_qasymm8(act_info.b(), output_quant_info); - - min_activation = act_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU ? output_quant_info.offset : b_const_int; - max_activation = act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU ? 255 : a_const_int; - - // If the activation layer is RELU, BOUNDED_RELU or LU_BOUNDED_RELU, we can use the GEMMLowp output stage to perform this operation - _is_activationlayer_enabled = false; + if(supported_acts.count(act_info.activation()) != 0) + { + const int a_const_int = quantize_qasymm8(act_info.a(), output_quant_info); + const int b_const_int = quantize_qasymm8(act_info.b(), output_quant_info); + + min_activation = act_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU ? output_quant_info.offset : b_const_int; + max_activation = act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU ? 255 : a_const_int; + } + else + { + _fuse_activation = false; + } } // Set the GEMMLowp output stage info @@ -323,7 +340,7 @@ void CLGEMMConvolutionLayer::configure(const ICLTensor *input, const ICLTensor * // In case of NHWC, we need to run GEMM3D (gemm_3d_depth != 0) in order to avoid reshaping the output matrix const unsigned int gemm_3d_depth = (data_layout == DataLayout::NHWC) ? conv_h : 0; - configure_mm(gemm_input_to_use, &_weights_reshaped, biases, gemm_output_to_use, gemmlowp_output_stage, gemm_3d_depth); + configure_mm(gemm_input_to_use, &_weights_reshaped, biases_to_use, gemm_output_to_use, gemmlowp_output_stage, gemm_3d_depth, act_info); if(!_skip_im2col) { @@ -345,7 +362,7 @@ void CLGEMMConvolutionLayer::configure(const ICLTensor *input, const ICLTensor * ARM_COMPUTE_ERROR_ON_MSG((output->info()->dimension(idx_width) != conv_w) || (output->info()->dimension(idx_height) != conv_h), "Output shape does not match the expected one"); - if(_is_activationlayer_enabled) + if(!_fuse_activation) { _activationlayer_function.configure(output, nullptr, act_info); } @@ -382,12 +399,10 @@ Status CLGEMMConvolutionLayer::validate(const ITensorInfo *input, const ITensorI const ITensorInfo *gemm_output_to_use = output; const ITensorInfo *weights_to_use = weights; - const bool is_quantized = is_data_type_quantized_asymmetric(data_type); - const bool append_bias = (biases != nullptr) && (!is_quantized); - const bool skip_im2col = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && conv_info.stride().first == 1 && conv_info.stride().second == 1); - const bool skip_col2im = data_layout == DataLayout::NHWC; - bool is_activationlayer_enabled = act_info.enabled(); - const bool run_addition = (skip_im2col) && (append_bias); + const bool is_quantized = is_data_type_quantized_asymmetric(data_type); + const bool skip_im2col = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && conv_info.stride().first == 1 && conv_info.stride().second == 1); + const bool skip_col2im = data_layout == DataLayout::NHWC; + bool fuse_activation = true; const UniformQuantizationInfo iq_info = input->quantization_info().uniform(); const UniformQuantizationInfo wq_info = weights->quantization_info().uniform(); @@ -429,10 +444,26 @@ Status CLGEMMConvolutionLayer::validate(const ITensorInfo *input, const ITensorI unsigned int mat_weights_cols = weights->dimension(idx_kernels) / num_groups; - // Output tensor auto inizialitation if not yet initialized - ARM_COMPUTE_RETURN_ON_ERROR(CLConvolutionLayerReshapeWeights::validate(weights, is_quantized ? nullptr : biases, nullptr, num_groups)); - weights_reshaped_info = TensorInfo(compute_weights_reshaped_shape(*weights, (append_bias && !skip_im2col), num_groups), 1, data_type); - weights_to_use = &weights_reshaped_info; + const ITensorInfo *biases_to_use = biases; + bool append_bias = false; + + if(num_groups != 1 && biases != nullptr) + { + // num_groups != 1 can only be for NCHW + // Since it is missing an utility function to reshape the biases, we append the biases into the weights tensor + biases_to_use = nullptr; + append_bias = true; + + ARM_COMPUTE_RETURN_ON_ERROR(CLConvolutionLayerReshapeWeights::validate(weights, biases, nullptr, num_groups)); + weights_reshaped_info = TensorInfo(compute_weights_reshaped_shape(*weights, true, num_groups), 1, data_type); + } + else + { + ARM_COMPUTE_RETURN_ON_ERROR(CLConvolutionLayerReshapeWeights::validate(weights, nullptr, nullptr, num_groups)); + weights_reshaped_info = TensorInfo(compute_weights_reshaped_shape(*weights, false, num_groups), 1, data_type); + } + + weights_to_use = &weights_reshaped_info; if(!skip_im2col) { @@ -446,11 +477,6 @@ Status CLGEMMConvolutionLayer::validate(const ITensorInfo *input, const ITensorI ARM_COMPUTE_RETURN_ON_ERROR(CLIm2ColKernel::validate(input, &im2col_reshaped_info, kernel_dims, conv_info, append_bias, dilation, num_groups)); gemm_input_to_use = &im2col_reshaped_info; } - else if(run_addition) - { - // Validate add bias kernel - ARM_COMPUTE_RETURN_ON_ERROR(CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::ADD, output, biases, output, ConvertPolicy::SATURATE)); - } // Create GEMM output tensor if(!skip_col2im) @@ -490,16 +516,20 @@ Status CLGEMMConvolutionLayer::validate(const ITensorInfo *input, const ITensorI ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU }; - if(is_activationlayer_enabled && supported_acts.count(act_info.activation()) != 0) + if(act_info.enabled()) { - const int a_const_int = quantize_qasymm8(act_info.a(), output_quant_info); - const int b_const_int = quantize_qasymm8(act_info.b(), output_quant_info); - - min_activation = act_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU ? output_quant_info.offset : b_const_int; - max_activation = act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU ? 255 : a_const_int; - - // If the activation layer is RELU, BOUNDED_RELU or LU_BOUNDED_RELU, we can use the GEMMLowp output stage to perform this operation - is_activationlayer_enabled = false; + if(supported_acts.count(act_info.activation()) != 0) + { + const int a_const_int = quantize_qasymm8(act_info.a(), output_quant_info); + const int b_const_int = quantize_qasymm8(act_info.b(), output_quant_info); + + min_activation = act_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU ? output_quant_info.offset : b_const_int; + max_activation = act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU ? 255 : a_const_int; + } + else + { + fuse_activation = false; + } } // Set the GEMMLowp output stage info @@ -513,7 +543,7 @@ Status CLGEMMConvolutionLayer::validate(const ITensorInfo *input, const ITensorI // In case of NHWC, we need to run GEMM3D (gemm_3d_depth != 0) in order to avoid reshaping the output matrix const unsigned int gemm_3d_depth = (data_layout == DataLayout::NHWC) ? conv_h : 0; - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemm_input_to_use, weights_to_use, biases, gemm_output_to_use, gemmlowp_output_stage, gemm_3d_depth, skip_im2col, run_addition)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemm_input_to_use, weights_to_use, biases_to_use, gemm_output_to_use, gemmlowp_output_stage, gemm_3d_depth, skip_im2col, act_info)); // Validate Col2Im if(!skip_col2im) @@ -522,7 +552,7 @@ Status CLGEMMConvolutionLayer::validate(const ITensorInfo *input, const ITensorI } //Validate Activation Layer - if(is_activationlayer_enabled) + if(!fuse_activation) { ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(output, nullptr, act_info)); } @@ -554,19 +584,14 @@ void CLGEMMConvolutionLayer::run() _mm_gemm.run(); } - if(_run_addition) - { - CLScheduler::get().enqueue(_add_bias_kernel); - } - // Reshape output matrix if(!_skip_col2im) { CLScheduler::get().enqueue(_col2im_kernel, false); } - //Run Activation Layer if enabled - if(_is_activationlayer_enabled) + //Run Activation Layer if we cannot fuse in GEMM + if(!_fuse_activation) { _activationlayer_function.run(); } -- cgit v1.2.1