From 8b72199f25487040713d1668c998fdde3707413c Mon Sep 17 00:00:00 2001
From: Georgios Pinitas
Date: Mon, 28 Oct 2019 16:24:28 +0000
Subject: COMPMID-1889: Fuse bias addition and output stage in CLFCLayer.

Delegates bias addition and output stage calculation to
CLGEMMLowp/CLGEMM, respectively.

Change-Id: Ifd1f75b34eae766d3df80f07aec35fab45733e0b
Signed-off-by: Georgios Pinitas
Reviewed-on: https://review.mlplatform.org/c/2175
Reviewed-by: Michele Di Giorgio
Reviewed-by: Gian Marco Iodice
Tested-by: Arm Jenkins
Comments-Addressed: Arm Jenkins
---
 .../runtime/CL/functions/CLFullyConnectedLayer.h   |  10 +-
 src/runtime/CL/functions/CLFullyConnectedLayer.cpp | 173 ++++++++++-----------
 2 files changed, 84 insertions(+), 99 deletions(-)

diff --git a/arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h b/arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h
index 9512b22c08..7f872532e4 100644
--- a/arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h
+++ b/arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h
@@ -174,9 +174,9 @@ public:
     void prepare() override;
 
 private:
-    void configure_fc_fc(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output, bool retain_internal_weights);
-    void configure_conv_fc(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output, bool retain_internal_weights);
-    void configure_mm(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output, bool retain_internal_weights);
+    void configure_fc_fc(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, bool retain_internal_weights);
+    void configure_conv_fc(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, bool retain_internal_weights);
+    void configure_mm(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, bool retain_internal_weights);
 
     MemoryGroup                                         _memory_group;
     IWeightsManager                                    *_weights_manager;
@@ -187,16 +187,12 @@ private:
     CLFullyConnectedLayerReshapeWeights                 _reshape_weights_function;
     CLGEMM                                              _mm_gemm;
     CLGEMMLowpMatrixMultiplyCore                        _mm_gemmlowp;
-    CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint _gemmlowp_output_stage;
-    CLGEMMMatrixAccumulateBiasesKernel                  _accumulate_biases_kernel; // TODO(COMPMID-1889): Use CLGEMM to add bias in CLFullyConnectedLayer
     CLTensor                                            _flatten_output;
-    CLTensor                                            _gemmlowp_output;
     CLTensor                                            _converted_weights_output;
     CLTensor                                            _reshape_weights_output;
     bool                                                _are_weights_converted;
     bool                                                _are_weights_reshaped;
     bool                                                _is_fc_after_conv;
-    bool                                                _accumulate_biases;
     bool                                                _is_quantized;
     bool                                                _is_prepared;
     const ICLTensor                                    *_original_weights;
diff --git a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
index 91f722fdce..6ad4cd5468 100644
--- a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
+++ b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
@@ -40,8 +40,55 @@ using namespace arm_compute::utils::cast;
 
 namespace
 {
-Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const ITensorInfo &output)
+Status construct_gemmlowp_output_stage(const ITensorInfo &input, const ITensorInfo &weights, const ITensorInfo &output,
+                                       GEMMLowpOutputStageInfo &gemmlowp_output_stage)
 {
+    gemmlowp_output_stage.type                = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
+    gemmlowp_output_stage.gemmlowp_offset     = 0;
+    gemmlowp_output_stage.gemmlowp_multiplier = 0;
+    gemmlowp_output_stage.gemmlowp_shift      = 0;
+
+    // Configure output stage for quantized case
+    if(is_data_type_quantized_asymmetric(input.data_type()))
+    {
+        const UniformQuantizationInfo iq_info = input.quantization_info().uniform();
+        const UniformQuantizationInfo wq_info = weights.quantization_info().uniform();
+        const UniformQuantizationInfo oq_info = output.quantization_info().uniform();
+
+        const auto output_quant_info = (output.total_size() == 0) ? iq_info : oq_info;
+
+        const float multiplier        = (iq_info.scale * wq_info.scale) / output_quant_info.scale;
+        int         output_multiplier = 0;
+        int         output_shift      = 0;
+        ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift));
+
+        // Set the GEMMLowp output stage info
+        gemmlowp_output_stage.gemmlowp_offset     = output_quant_info.offset;
+        gemmlowp_output_stage.gemmlowp_multiplier = output_multiplier;
+        gemmlowp_output_stage.gemmlowp_shift      = output_shift;
+        gemmlowp_output_stage.gemmlowp_min_bound  = 0;
+        gemmlowp_output_stage.gemmlowp_max_bound  = 255;
+    }
+
+    return Status{};
+}
+
+Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const ITensorInfo *bias, const ITensorInfo &output)
+{
+    GEMMLowpOutputStageInfo gemmlowp_output_stage;
+    ARM_COMPUTE_RETURN_ON_ERROR(construct_gemmlowp_output_stage(input, weights, output, gemmlowp_output_stage));
+
+    const GEMMInfo &gemm_info = GEMMInfo(false,                  // is_a_reshaped
+                                         false,                  // is_b_reshaped
+                                         true,                   // reshape_b_only_on_first_run
+                                         0,                      // depth_output_gemm3d
+                                         false,                  // reinterpret_input_as_3d
+                                         false,                  // retain_internal_weights
+                                         gemmlowp_output_stage,  // gemmlowp_output_stage
+                                         false,                  // fp_mixed_precision
+                                         true,                   // broadcast_bias
+                                         ActivationLayerInfo()); // activation_info
+
     if(is_data_type_quantized_asymmetric(input.data_type()))
     {
         const UniformQuantizationInfo iq_info = input.quantization_info().uniform();
@@ -55,12 +102,13 @@ Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const I
         // Validate gemmlowp function
         ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyCore::validate(&input.clone()->set_quantization_info(input_quantization_info),
                                                                            &weights.clone()->set_quantization_info(weights_quantization_info),
-                                                                           nullptr,
-                                                                           &output));
+                                                                           bias,
+                                                                           &output,
+                                                                           gemm_info));
     }
     else
     {
-        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMM::validate(&input, &weights, nullptr, &output, 1.f, 0.0f, GEMMInfo(false, false, true /* Reshape weights only for the first run */)));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMM::validate(&input, &weights, bias, &output, 1.f, 1.f, gemm_info));
     }
 
     return Status{};
@@ -81,13 +129,26 @@ Status CLFullyConnectedLayerReshapeWeights::validate(const ITensorInfo *input, c
 CLFullyConnectedLayer::CLFullyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager, IWeightsManager *weights_manager)
     : _memory_group(memory_manager), _weights_manager(weights_manager), _convert_weights(), _convert_weights_managed(), _reshape_weights_managed_function(), _flatten_layer(), _reshape_weights_function(),
-      _mm_gemm(memory_manager, weights_manager), _mm_gemmlowp(memory_manager), _gemmlowp_output_stage(), _accumulate_biases_kernel(), _flatten_output(), _gemmlowp_output(), _converted_weights_output(),
-      _reshape_weights_output(), _are_weights_converted(true), _are_weights_reshaped(true), _is_fc_after_conv(true), _accumulate_biases(false), _is_quantized(false), _is_prepared(false),
-      _original_weights(nullptr)
+      _mm_gemm(memory_manager, weights_manager), _mm_gemmlowp(memory_manager), _flatten_output(), _converted_weights_output(), _reshape_weights_output(), _are_weights_converted(true),
+      _are_weights_reshaped(true), _is_fc_after_conv(true), _is_quantized(false), _is_prepared(false), _original_weights(nullptr)
 {
 }
 
-void CLFullyConnectedLayer::configure_mm(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output, bool retain_internal_weights)
+void CLFullyConnectedLayer::configure_mm(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, bool retain_internal_weights)
 {
+    GEMMLowpOutputStageInfo gemmlowp_output_stage;
+    construct_gemmlowp_output_stage(*input->info(), *weights->info(), *output->info(), gemmlowp_output_stage);
+
+    const GEMMInfo &gemm_info = GEMMInfo(false,                    // is_a_reshaped
+                                         false,                    // is_b_reshaped
+                                         true,                     // reshape_b_only_on_first_run
+                                         0,                        // depth_output_gemm3d
+                                         false,                    // reinterpret_input_as_3d
+                                         retain_internal_weights,  // retain_internal_weights
+                                         gemmlowp_output_stage,    // gemmlowp_output_stage
+                                         false,                    // fp_mixed_precision
+                                         true,                     // broadcast_bias
+                                         ActivationLayerInfo());   // activation_info
+
     if(_is_quantized)
     {
         // Since we need negative offsets for computing convolution, we need to change QuantizationInfo()
@@ -99,7 +160,7 @@ void CLFullyConnectedLayer::configure_mm(const ICLTensor *input, const ICLTensor
         weights->info()->set_quantization_info(QuantizationInfo(weights_quantization_info.uniform().scale, -weights_quantization_info.uniform().offset));
 
         // Configure gemmlowp function
-        _mm_gemmlowp.configure(input, weights, nullptr, output);
+        _mm_gemmlowp.configure(input, weights, bias, output, gemm_info);
 
         // Revert QuantizationInfo as input and weights could be used in other fully connected layers
         input->info()->set_quantization_info(input_quantization_info);
@@ -108,11 +169,11 @@ void CLFullyConnectedLayer::configure_mm(const ICLTensor *input, const ICLTensor
     else
     {
         // Configure matrix multiply kernel
-        _mm_gemm.configure(input, weights, nullptr, output, 1.f, 0.0f, GEMMInfo(false, false, true /* Reshape weights only for the first run */, 0, false, retain_internal_weights));
+        _mm_gemm.configure(input, weights, bias, output, 1.f, 1.f, gemm_info);
     }
 }
 
-void CLFullyConnectedLayer::configure_conv_fc(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output, bool retain_internal_weights)
+void CLFullyConnectedLayer::configure_conv_fc(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, bool retain_internal_weights)
 {
     ARM_COMPUTE_ERROR_ON((weights->info()->dimension(1) != (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2))));
 
@@ -127,18 +188,18 @@ void CLFullyConnectedLayer::configure_conv_fc(const ICLTensor *input, const ICLT
     _flatten_layer.configure(input, &_flatten_output);
 
     // Configure matrix multiply kernel
-    configure_mm(&_flatten_output, weights, output, retain_internal_weights);
+    configure_mm(&_flatten_output, weights, bias, output, retain_internal_weights);
 
     // Allocate the output tensor for flatten once all the configure methods have been called
     _flatten_output.allocator()->allocate();
 }
 
-void CLFullyConnectedLayer::configure_fc_fc(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output, bool retain_internal_weights)
+void CLFullyConnectedLayer::configure_fc_fc(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, bool retain_internal_weights)
 {
     ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != weights->info()->dimension(1));
 
     // Configure matrix multiply kernel
-    configure_mm(input, weights, output, retain_internal_weights);
+    configure_mm(input, weights, bias, output, retain_internal_weights);
 }
 
 void CLFullyConnectedLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output,
@@ -156,7 +217,6 @@ void CLFullyConnectedLayer::configure(const ICLTensor *input, const ICLTensor *w
     _are_weights_converted = true;
     _are_weights_reshaped  = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true;
     _is_fc_after_conv      = true;
-    _accumulate_biases     = false;
     _is_quantized          = is_data_type_quantized_asymmetric(input->info()->data_type());
     _is_prepared           = fc_info.retain_internal_weights;
     _original_weights      = weights;
@@ -166,24 +226,6 @@ void CLFullyConnectedLayer::configure(const ICLTensor *input, const ICLTensor *w
         _weights_manager->manage(weights);
     }
 
-    // Configure gemmlowp output
-    if(_is_quantized)
-    {
-        _gemmlowp_output.allocator()->init(output->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32));
-    }
-
-    // Configure accumulate biases kernel for non quantized asymmetric types
-    if(biases != nullptr && !_is_quantized)
-    {
-        ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
-
-        _accumulate_biases = true;
-
-        // Configure accumulate biases kernel
-        _accumulate_biases_kernel.set_target(CLScheduler::get().target());
-        _accumulate_biases_kernel.configure(output, biases);
-    }
-
     const ICLTensor *weights_to_use = weights;
 
     // With the Fully Connected layer we can have 4 different cases:
@@ -244,32 +286,15 @@ void CLFullyConnectedLayer::configure(const ICLTensor *input, const ICLTensor *w
         _are_weights_converted = false;
     }
 
-    // Configure fc core
-    ICLTensor *tmp_output = (_is_quantized) ? &_gemmlowp_output : output;
     if(_is_fc_after_conv)
     {
         // Fully Connected layer after a Convolution Layer without batches
-        configure_conv_fc(input, weights_to_use, tmp_output, fc_info.retain_internal_weights);
+        configure_conv_fc(input, weights_to_use, biases, output, fc_info.retain_internal_weights);
     }
     else
    {
         // Fully Connected layer after a Fully Connected Layer without batches
-        configure_fc_fc(input, weights_to_use, tmp_output, fc_info.retain_internal_weights);
-    }
-
-    // Configure output stage for asymmetric quantized types
-    if(_is_quantized)
-    {
-        const UniformQuantizationInfo iq_info = input->info()->quantization_info().uniform();
-        const UniformQuantizationInfo wq_info = weights->info()->quantization_info().uniform();
-        const UniformQuantizationInfo oq_info = output->info()->quantization_info().uniform();
-
-        float multiplier = iq_info.scale * wq_info.scale / oq_info.scale;
-        int   output_multiplier;
-        int   output_shift;
-        quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
-        _gemmlowp_output_stage.configure(&_gemmlowp_output, biases, output, output_multiplier, output_shift, oq_info.offset);
-        _gemmlowp_output.allocator()->allocate();
+        configure_fc_fc(input, weights_to_use, biases, output, fc_info.retain_internal_weights);
     }
 }
 
@@ -281,22 +306,12 @@ Status CLFullyConnectedLayer::validate(const ITensorInfo *input, const ITensorIn
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output);
     ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 2);
 
-    bool            weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true;
-    bool            is_fc_after_conv = true;
-    bool            is_quantized     = is_data_type_quantized_asymmetric(input->data_type());
-    const GPUTarget gpu_target       = CLScheduler::get().target();
+    bool weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true;
+    bool is_fc_after_conv = true;
 
     const ITensorInfo &flatten_input     = TensorInfo(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_flatten_shape(input)).set_data_layout(DataLayout::NCHW));
     const ITensorInfo &reshaped_weights  = TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_transposed_shape(*weights)));
     const ITensorInfo &converted_weights = weights_reshaped ? TensorInfo(weights->clone()->set_is_resizable(true).reset_padding()) : TensorInfo(*reshaped_weights.clone());
-    const ITensorInfo &gemmlowp_output   = TensorInfo(output->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32));
-
-    // Configure accumulate biases kernel for non quantized asymmetric types
-    if(biases != nullptr && !is_quantized)
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
-        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixAccumulateBiasesKernel::validate(output, biases, gpu_target));
-    }
 
     // With the Fully Connected layer we can have 4 different cases:
     // 1) Convolution layer -> Fully Connected layer without batches
@@ -306,7 +321,6 @@ Status CLFullyConnectedLayer::validate(const ITensorInfo *input, const ITensorIn
 
     const ITensorInfo *input_to_use   = input;
     const ITensorInfo *weights_to_use = weights;
-    const ITensorInfo *tmp_output     = (is_quantized) ? &gemmlowp_output : output;
 
     // Check if we have a fully connected layer with batches
     const bool is_batched_fc_layer = output->dimension(1) > 1;
@@ -352,21 +366,9 @@ Status CLFullyConnectedLayer::validate(const ITensorInfo *input, const ITensorIn
         // Fully Connected layer after a Fully Connected Layer without batches
         ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != weights_to_use->dimension(1));
     }
-    // Validate matrix multiply kernel
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(*input_to_use, *weights_to_use, *tmp_output));
 
-    // Validate output stage for asymmetric quantized types
-    if(is_quantized)
-    {
-        const UniformQuantizationInfo iq_info = input->quantization_info().uniform();
-        const UniformQuantizationInfo wq_info = weights->quantization_info().uniform();
-        const UniformQuantizationInfo oq_info = output->quantization_info().uniform();
-        const float                   multiplier = iq_info.scale * wq_info.scale / oq_info.scale;
-
-        ARM_COMPUTE_UNUSED(multiplier);
-        ARM_COMPUTE_RETURN_ERROR_ON(multiplier > 1.0f);
-        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::validate(&gemmlowp_output, biases, output));
-    }
+    // Validate matrix multiply kernel
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(*input_to_use, *weights_to_use, biases, *output));
 
     return Status{};
 }
@@ -392,19 +394,6 @@ void CLFullyConnectedLayer::run()
     {
         _mm_gemm.run();
     }
-
-    // Accumulate biases if provided
-    if(_is_quantized)
-    {
-        _gemmlowp_output_stage.run();
-    }
-    else
-    {
-        if(_accumulate_biases)
-        {
-            CLScheduler::get().enqueue(_accumulate_biases_kernel);
-        }
-    }
 }
 
 void CLFullyConnectedLayer::prepare()
-- 
cgit v1.2.1
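
Note: a minimal caller-side sketch (not part of the patch) of the path this
change exercises. After the fusion, a quantized fully connected layer needs
only CLFullyConnectedLayer::configure(): the S32 bias and the S32 -> QASYMM8
requantization are applied inside CLGEMMLowpMatrixMultiplyCore through the
GEMMLowpOutputStageInfo built by construct_gemmlowp_output_stage(), replacing
the removed CLGEMMMatrixAccumulateBiasesKernel and
CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint steps. Tensor shapes and
quantization parameters below are illustrative assumptions, not taken from
the patch.

#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLFunctions.h"
#include "arm_compute/runtime/CL/CLScheduler.h"

using namespace arm_compute;

int main()
{
    // Initialise the OpenCL context, queue and tuner used by CL functions
    CLScheduler::get().default_init();

    // Illustrative 128-input -> 64-output layer, single batch
    CLTensor input, weights, bias, output;
    input.allocator()->init(TensorInfo(TensorShape(128U), 1, DataType::QASYMM8, QuantizationInfo(0.05f, 10)));
    weights.allocator()->init(TensorInfo(TensorShape(128U, 64U), 1, DataType::QASYMM8, QuantizationInfo(0.01f, 5)));
    bias.allocator()->init(TensorInfo(TensorShape(64U), 1, DataType::S32));
    output.allocator()->init(TensorInfo(TensorShape(64U), 1, DataType::QASYMM8, QuantizationInfo(0.1f, 128)));

    // Bias addition and the fixed-point output stage are now fused into the
    // GEMMLowp core function; no separate output-stage function is configured.
    CLFullyConnectedLayer fc;
    fc.configure(&input, &weights, &bias, &output);

    input.allocator()->allocate();
    weights.allocator()->allocate();
    bias.allocator()->allocate();
    output.allocator()->allocate();

    fc.run();
    CLScheduler::get().sync();
    return 0;
}

The same descriptors can be checked up front with
CLFullyConnectedLayer::validate(), which now routes through the bias-aware
validate_mm() shown in the patch.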