aboutsummaryrefslogtreecommitdiff
path: root/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
diff options
context:
space:
mode:
authorGeorgios Pinitas <georgios.pinitas@arm.com>2017-11-29 11:06:49 +0000
committerAnthony Barbier <anthony.barbier@arm.com>2018-11-02 16:41:58 +0000
commit45bcc3a1c287a208098ae99288273a5129ddd5eb (patch)
treef4f957dbc76f8e8e9a4871b16652e1033bcd4c73 /src/runtime/CL/functions/CLFullyConnectedLayer.cpp
parent303be90ee1f03f75309b421297ba16428ea98ea5 (diff)
downloadComputeLibrary-45bcc3a1c287a208098ae99288273a5129ddd5eb.tar.gz
COMPMID-661: QASYMM8 support for fully connected layer.
Change-Id: I70e04d3a175ba366432ada98e9ca893c9f81b260 Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/111094 Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com> Tested-by: BSG Visual Compute Jenkins server to access repositories on http://mpd-gerrit.cambridge.arm.com <bsgcomp@arm.com> Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
Diffstat (limited to 'src/runtime/CL/functions/CLFullyConnectedLayer.cpp')
-rw-r--r--src/runtime/CL/functions/CLFullyConnectedLayer.cpp97
1 files changed, 70 insertions, 27 deletions
diff --git a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
index 72d374e9c2..88aaf1cae8 100644
--- a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
+++ b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
@@ -25,6 +25,7 @@
#include "arm_compute/core/Size2D.h"
#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "support/ToolchainSupport.h"
@@ -40,70 +41,87 @@ void CLFullyConnectedLayerReshapeWeights::configure(const ICLTensor *input, ICLT
}
CLFullyConnectedLayer::CLFullyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _im2col_kernel(), _reshape_weights_kernel(), _mm_kernel(), _accumulate_biases_kernel(), _im2col_output(), _reshape_weights_output(),
- _are_weights_reshaped(true), _is_fc_after_conv(true), _accumulate_biases(false)
+ : _memory_group(memory_manager), _im2col_kernel(), _reshape_weights_kernel(), _mm_kernel(), _mm_gemmlowp(memory_manager), _gemmlowp_output_stage(), _accumulate_biases_kernel(), _im2col_output(),
+ _gemmlowp_output(), _reshape_weights_output(), _are_weights_reshaped(true), _is_fc_after_conv(true), _accumulate_biases(false), _is_quantized(false)
{
}
-void CLFullyConnectedLayer::configure_conv_fc(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output, const GPUTarget gpu_target)
+void CLFullyConnectedLayer::configure_mm(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output, bool is_interleaved_transposed)
{
- ARM_COMPUTE_ERROR_ON((weights->info()->dimension(1) != (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2))));
+ if(_is_quantized)
+ {
+ // Extract and negate input and weights offset
+ QuantizationInfo input_quantization_info = input->info()->quantization_info();
+ QuantizationInfo weights_quantization_info = weights->info()->quantization_info();
+ input->info()->set_quantization_info(QuantizationInfo(input_quantization_info.scale, -input_quantization_info.offset));
+ weights->info()->set_quantization_info(QuantizationInfo(weights_quantization_info.scale, -weights_quantization_info.offset));
+ // Configure gemmlowp function
+ _mm_gemmlowp.configure(input, weights, output);
+ }
+ else
+ {
+ // Configure matrix multiply kernel
+ _mm_kernel.set_target(CLScheduler::get().target());
+ _mm_kernel.configure(input, weights, output, 1.f, is_interleaved_transposed);
+ }
+}
- const DataType dt = input->info()->data_type();
- const int fixed_point_position = input->info()->fixed_point_position();
+void CLFullyConnectedLayer::configure_conv_fc(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output)
+{
+ ARM_COMPUTE_ERROR_ON((weights->info()->dimension(1) != (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2))));
// If the fully connected layer is called after a convolution layer, the input tensor must be linearized
// Initialize output tensor for im2col
- TensorShape shape_im2col;
- shape_im2col.set(0, input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2));
- shape_im2col.set(1, input->info()->dimension(3));
- shape_im2col.set(2, input->info()->dimension(4));
- shape_im2col.set(3, input->info()->dimension(5));
- _im2col_output.allocator()->init(TensorInfo(shape_im2col, 1, dt, fixed_point_position));
+ TensorShape shape_im2col = input->info()->tensor_shape();
+ shape_im2col.collapse(3);
+ _im2col_output.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_im2col));
// Configure im2col kernel
_memory_group.manage(&_im2col_output);
_im2col_kernel.configure(input, &_im2col_output, Size2D(1, 1), PadStrideInfo(1, 1, 0, 0), false);
// Configure matrix multiply kernel
- _mm_kernel.set_target(gpu_target);
- _mm_kernel.configure(&_im2col_output, weights, output, 1.0f, false);
+ configure_mm(&_im2col_output, weights, output, false);
// Allocate the output tensor for im2col once all the configure methods have been called
_im2col_output.allocator()->allocate();
}
-void CLFullyConnectedLayer::configure_fc_fc(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output, const GPUTarget gpu_target)
+void CLFullyConnectedLayer::configure_fc_fc(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output)
{
ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != weights->info()->dimension(1));
// Configure matrix multiply kernel
- _mm_kernel.set_target(gpu_target);
- _mm_kernel.configure(input, weights, output, 1.0f, false);
+ configure_mm(input, weights, output, false);
}
void CLFullyConnectedLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, bool transpose_weights, bool are_weights_reshaped)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8, DataType::QS16, DataType::F16, DataType::F32);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output);
ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() > 2);
_are_weights_reshaped = transpose_weights ? are_weights_reshaped : true;
_is_fc_after_conv = true;
_accumulate_biases = false;
+ _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type());
- // Get GPU target
- const GPUTarget gpu_target = CLScheduler::get().target();
+ // Configure gemmlowp output
+ if(_is_quantized)
+ {
+ _gemmlowp_output.allocator()->init(output->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32));
+ }
- if(biases != nullptr)
+ // Configure accumulate biases kernel for non quantized asymmetric types
+ if(biases != nullptr && !_is_quantized)
{
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
_accumulate_biases = true;
// Configure accumulate biases kernel
- _accumulate_biases_kernel.set_target(gpu_target);
+ _accumulate_biases_kernel.set_target(CLScheduler::get().target());
_accumulate_biases_kernel.configure(output, biases);
}
@@ -137,15 +155,26 @@ void CLFullyConnectedLayer::configure(const ICLTensor *input, const ICLTensor *w
_is_fc_after_conv = input->info()->num_dimensions() > 1;
}
+ ICLTensor *tmp_output = (_is_quantized) ? &_gemmlowp_output : output;
if(_is_fc_after_conv)
{
// Fully Connected layer after a Convolution Layer without batches
- configure_conv_fc(input, weights_to_use, output, gpu_target);
+ configure_conv_fc(input, weights_to_use, tmp_output);
}
else
{
// Fully Connected layer after a Fully Connected Layer without batches
- configure_fc_fc(input, weights_to_use, output, gpu_target);
+ configure_fc_fc(input, weights_to_use, tmp_output);
+ }
+
+ // Configure output stage for asymmetric quantized types
+ if(_is_quantized)
+ {
+ float multiplier = input->info()->quantization_info().scale * weights->info()->quantization_info().scale / output->info()->quantization_info().scale;
+ int output_multiplier, output_shift;
+ quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
+ _gemmlowp_output_stage.configure(&_gemmlowp_output, biases, output, output->info()->quantization_info().offset, output_multiplier, output_shift);
+ _gemmlowp_output.allocator()->allocate();
}
// Allocate the transpose tensor if the are_weights_reshaped flag is false and once all the configure methods have been called
@@ -174,12 +203,26 @@ void CLFullyConnectedLayer::run()
}
// Run matrix multiply
- CLScheduler::get().enqueue(_mm_kernel, !_accumulate_biases);
+ if(_is_quantized)
+ {
+ _mm_gemmlowp.run();
+ }
+ else
+ {
+ CLScheduler::get().enqueue(_mm_kernel, !_accumulate_biases);
+ }
// Accumulate biases if provided
- if(_accumulate_biases)
+ if(_is_quantized)
+ {
+ _gemmlowp_output_stage.run();
+ }
+ else
{
- CLScheduler::get().enqueue(_accumulate_biases_kernel);
+ if(_accumulate_biases)
+ {
+ CLScheduler::get().enqueue(_accumulate_biases_kernel);
+ }
}
_memory_group.release();