From 45bcc3a1c287a208098ae99288273a5129ddd5eb Mon Sep 17 00:00:00 2001 From: Georgios Pinitas Date: Wed, 29 Nov 2017 11:06:49 +0000 Subject: COMPMID-661: QASYMM8 support for fully connected layer. Change-Id: I70e04d3a175ba366432ada98e9ca893c9f81b260 Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/111094 Reviewed-by: Gian Marco Iodice Tested-by: BSG Visual Compute Jenkins server to access repositories on http://mpd-gerrit.cambridge.arm.com Reviewed-by: Anthony Barbier --- src/runtime/CL/functions/CLFullyConnectedLayer.cpp | 97 ++++++++++++++++------ 1 file changed, 70 insertions(+), 27 deletions(-) (limited to 'src/runtime/CL/functions') diff --git a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp index 72d374e9c2..88aaf1cae8 100644 --- a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp +++ b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp @@ -25,6 +25,7 @@ #include "arm_compute/core/Size2D.h" #include "arm_compute/core/Validate.h" +#include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/runtime/CL/CLScheduler.h" #include "support/ToolchainSupport.h" @@ -40,70 +41,87 @@ void CLFullyConnectedLayerReshapeWeights::configure(const ICLTensor *input, ICLT } CLFullyConnectedLayer::CLFullyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), _im2col_kernel(), _reshape_weights_kernel(), _mm_kernel(), _accumulate_biases_kernel(), _im2col_output(), _reshape_weights_output(), - _are_weights_reshaped(true), _is_fc_after_conv(true), _accumulate_biases(false) + : _memory_group(memory_manager), _im2col_kernel(), _reshape_weights_kernel(), _mm_kernel(), _mm_gemmlowp(memory_manager), _gemmlowp_output_stage(), _accumulate_biases_kernel(), _im2col_output(), + _gemmlowp_output(), _reshape_weights_output(), _are_weights_reshaped(true), _is_fc_after_conv(true), _accumulate_biases(false), _is_quantized(false) { } -void CLFullyConnectedLayer::configure_conv_fc(const 
ICLTensor *input, const ICLTensor *weights, ICLTensor *output, const GPUTarget gpu_target) +void CLFullyConnectedLayer::configure_mm(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output, bool is_interleaved_transposed) { - ARM_COMPUTE_ERROR_ON((weights->info()->dimension(1) != (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2)))); + if(_is_quantized) + { + // Extract and negate input and weights offset + QuantizationInfo input_quantization_info = input->info()->quantization_info(); + QuantizationInfo weights_quantization_info = weights->info()->quantization_info(); + input->info()->set_quantization_info(QuantizationInfo(input_quantization_info.scale, -input_quantization_info.offset)); + weights->info()->set_quantization_info(QuantizationInfo(weights_quantization_info.scale, -weights_quantization_info.offset)); + // Configure gemmlowp function + _mm_gemmlowp.configure(input, weights, output); + } + else + { + // Configure matrix multiply kernel + _mm_kernel.set_target(CLScheduler::get().target()); + _mm_kernel.configure(input, weights, output, 1.f, is_interleaved_transposed); + } +} - const DataType dt = input->info()->data_type(); - const int fixed_point_position = input->info()->fixed_point_position(); +void CLFullyConnectedLayer::configure_conv_fc(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output) +{ + ARM_COMPUTE_ERROR_ON((weights->info()->dimension(1) != (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2)))); // If the fully connected layer is called after a convolution layer, the input tensor must be linearized // Initialize output tensor for im2col - TensorShape shape_im2col; - shape_im2col.set(0, input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2)); - shape_im2col.set(1, input->info()->dimension(3)); - shape_im2col.set(2, input->info()->dimension(4)); - shape_im2col.set(3, input->info()->dimension(5)); - 
_im2col_output.allocator()->init(TensorInfo(shape_im2col, 1, dt, fixed_point_position)); + TensorShape shape_im2col = input->info()->tensor_shape(); + shape_im2col.collapse(3); + _im2col_output.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_im2col)); // Configure im2col kernel _memory_group.manage(&_im2col_output); _im2col_kernel.configure(input, &_im2col_output, Size2D(1, 1), PadStrideInfo(1, 1, 0, 0), false); // Configure matrix multiply kernel - _mm_kernel.set_target(gpu_target); - _mm_kernel.configure(&_im2col_output, weights, output, 1.0f, false); + configure_mm(&_im2col_output, weights, output, false); // Allocate the output tensor for im2col once all the configure methods have been called _im2col_output.allocator()->allocate(); } -void CLFullyConnectedLayer::configure_fc_fc(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output, const GPUTarget gpu_target) +void CLFullyConnectedLayer::configure_fc_fc(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output) { ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != weights->info()->dimension(1)); // Configure matrix multiply kernel - _mm_kernel.set_target(gpu_target); - _mm_kernel.configure(input, weights, output, 1.0f, false); + configure_mm(input, weights, output, false); } void CLFullyConnectedLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, bool transpose_weights, bool are_weights_reshaped) { - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8, DataType::QS16, DataType::F16, DataType::F32); ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output); ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() > 2); _are_weights_reshaped = transpose_weights ? 
are_weights_reshaped : true; _is_fc_after_conv = true; _accumulate_biases = false; + _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type()); - // Get GPU target - const GPUTarget gpu_target = CLScheduler::get().target(); + // Configure gemmlowp output + if(_is_quantized) + { + _gemmlowp_output.allocator()->init(output->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32)); + } - if(biases != nullptr) + // Configure accumulate biases kernel for non quantized asymmetric types + if(biases != nullptr && !_is_quantized) { ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases); _accumulate_biases = true; // Configure accumulate biases kernel - _accumulate_biases_kernel.set_target(gpu_target); + _accumulate_biases_kernel.set_target(CLScheduler::get().target()); _accumulate_biases_kernel.configure(output, biases); } @@ -137,15 +155,26 @@ void CLFullyConnectedLayer::configure(const ICLTensor *input, const ICLTensor *w _is_fc_after_conv = input->info()->num_dimensions() > 1; } + ICLTensor *tmp_output = (_is_quantized) ? 
&_gemmlowp_output : output; if(_is_fc_after_conv) { // Fully Connected layer after a Convolution Layer without batches - configure_conv_fc(input, weights_to_use, output, gpu_target); + configure_conv_fc(input, weights_to_use, tmp_output); } else { // Fully Connected layer after a Fully Connected Layer without batches - configure_fc_fc(input, weights_to_use, output, gpu_target); + configure_fc_fc(input, weights_to_use, tmp_output); + } + + // Configure output stage for asymmetric quantized types + if(_is_quantized) + { + float multiplier = input->info()->quantization_info().scale * weights->info()->quantization_info().scale / output->info()->quantization_info().scale; + int output_multiplier, output_shift; + quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift); + _gemmlowp_output_stage.configure(&_gemmlowp_output, biases, output, output->info()->quantization_info().offset, output_multiplier, output_shift); + _gemmlowp_output.allocator()->allocate(); } // Allocate the transpose tensor if the are_weights_reshaped flag is false and once all the configure methods have been called @@ -174,12 +203,26 @@ void CLFullyConnectedLayer::run() } // Run matrix multiply - CLScheduler::get().enqueue(_mm_kernel, !_accumulate_biases); + if(_is_quantized) + { + _mm_gemmlowp.run(); + } + else + { + CLScheduler::get().enqueue(_mm_kernel, !_accumulate_biases); + } // Accumulate biases if provided - if(_accumulate_biases) + if(_is_quantized) + { + _gemmlowp_output_stage.run(); + } + else { - CLScheduler::get().enqueue(_accumulate_biases_kernel); + if(_accumulate_biases) + { + CLScheduler::get().enqueue(_accumulate_biases_kernel); + } } _memory_group.release(); -- cgit v1.2.1