From 529b5a2355ce6354af3ea9f97af810a94908e7fe Mon Sep 17 00:00:00 2001 From: Georgios Pinitas Date: Tue, 27 Jul 2021 15:55:30 +0100 Subject: Port ClFullyConnected to new API Resolves: COMPMID-4391 Signed-off-by: Georgios Pinitas Change-Id: Idcd5e22ed6e901c7f4c7530e5547ea6a7814ae59 Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/6025 Reviewed-by: Michele Di Giorgio Comments-Addressed: Arm Jenkins Tested-by: Arm Jenkins --- Android.bp | 1 + .../runtime/CL/functions/CLFullyConnectedLayer.h | 124 +----- filelist.json | 14 + src/runtime/CL/functions/CLFullyConnectedLayer.cpp | 446 ++---------------- src/runtime/gpu/cl/operators/ClFullyConnected.cpp | 496 +++++++++++++++++++++ src/runtime/gpu/cl/operators/ClFullyConnected.h | 138 ++++++ .../cl/operators/ClGemmLowpMatrixMultiplyCore.h | 6 +- 7 files changed, 698 insertions(+), 527 deletions(-) create mode 100644 src/runtime/gpu/cl/operators/ClFullyConnected.cpp create mode 100644 src/runtime/gpu/cl/operators/ClFullyConnected.h diff --git a/Android.bp b/Android.bp index 554b1d1bb7..dd8ea63d70 100644 --- a/Android.bp +++ b/Android.bp @@ -673,6 +673,7 @@ cc_library_static { "src/runtime/gpu/cl/operators/ClFill.cpp", "src/runtime/gpu/cl/operators/ClFlatten.cpp", "src/runtime/gpu/cl/operators/ClFloor.cpp", + "src/runtime/gpu/cl/operators/ClFullyConnected.cpp", "src/runtime/gpu/cl/operators/ClGemm.cpp", "src/runtime/gpu/cl/operators/ClGemmConvolution.cpp", "src/runtime/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.cpp", diff --git a/arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h b/arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h index 82d1621341..9235a85d2c 100644 --- a/arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h +++ b/arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h @@ -24,76 +24,14 @@ #ifndef ARM_COMPUTE_CLFULLYCONNECTEDLAYER_H #define ARM_COMPUTE_CLFULLYCONNECTEDLAYER_H -#include "arm_compute/runtime/CL/ICLSimpleFunction.h" +#include "arm_compute/runtime/IFunction.h" 
#include "arm_compute/runtime/CL/CLTensor.h" -#include "arm_compute/runtime/CL/functions/CLConvertFullyConnectedWeights.h" -#include "arm_compute/runtime/CL/functions/CLFlattenLayer.h" -#include "arm_compute/runtime/CL/functions/CLGEMM.h" -#include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h" -#include "arm_compute/runtime/CL/functions/CLTranspose.h" #include "arm_compute/runtime/IWeightsManager.h" #include "arm_compute/runtime/MemoryGroup.h" namespace arm_compute { -namespace weights_transformations -{ -/** Basic function to manage the reshape weights generated from @ref CLTranspose */ -class CLFullyConnectedLayerReshapeWeightsManaged : public ITransformWeights -{ -public: - //Inherited method override - void run() override - { - _output.allocator()->allocate(); - _func.run(); - _reshape_run = true; - } - - //Inherited method override - void release() override - { - _output.allocator()->free(); - } - - //Inherited method override - ICLTensor *get_weights() override - { - return &_output; - } - - //Inherited method override - uint32_t uid() override - { - return _uid; - } - - /** Configures the @ref CLTranspose function - * - * @param[in] input Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32. - */ - void configure(const ICLTensor *input) - { - configure(CLKernelLibrary::get().get_compile_context(), input); - } - /** Configures the @ref CLTranspose function - * - * @param[in] compile_context The compile context to be used. - * @param[in] input Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32. - */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input) - { - _func.configure(compile_context, input, &_output); - } - -private: - static constexpr uint32_t _uid = 0x0; - CLTensor _output{}; - CLTranspose _func{}; -}; -} // namespace weights_transformations - /** Basic function to compute a Fully Connected layer on OpenCL. 
This function calls the following OpenCL kernels: * * -# @ref opencl::kernels::ClIm2ColKernel (called when the input comes from a convolutional layer) @@ -107,6 +45,8 @@ class CLFullyConnectedLayer : public IFunction public: /** Constructor */ CLFullyConnectedLayer(std::shared_ptr memory_manager = nullptr, IWeightsManager *weights_manager = nullptr); + /** Default destructor */ + ~CLFullyConnectedLayer(); /** Prevent instances of this class from being copied (As this class contains pointers) */ CLFullyConnectedLayer(const CLFullyConnectedLayer &) = delete; /** Default move constructor */ @@ -128,22 +68,6 @@ public: * |F32 |F32 |F32 |F32 | * |QASYMM8 |QASYMM8 |S32 |QASYMM8 | * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |QASYMM8_SIGNED | - * - * @param[in] input Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32. - * @param[in] weights Weights tensor. The weights must be 2 dimensional. - * If this function is called after a Convolution Layer, the (transposed) weights will have as many rows as the product of the first 3 input's dimensions. - * If it is called after another FullyConnected Layer, the (transposed) weights will have as many rows as the input's first dimension. - * Data type supported: Same as @p input. - * @param[in] biases Bias tensor. Can be nullptr. Data type supported:Same as @p input. - * @param[out] output Destination tensor. Its shape should be equal to the output of a matrix multiplication between: - * - The output of im2col on the input and the (transposed) 2D weights, if the function is called after a Convolution Layer - * - The input tensor and the (transposed) 2D weights, if the function is called after another FullyConnected Layer. - * Data type supported: Same as @p input. 
- * @param[in] fc_info (Optional) Fully connected layer additional info - */ - void configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, - FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo()); - /** Set the input and output tensors. * * @param[in] compile_context The compile context to be used. * @param[in] input Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32. @@ -160,19 +84,15 @@ public: */ void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo()); + /** Set the input and output tensors. + * + * Similar to @ref CLFullyConnectedLayer + */ + void configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, + FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo()); /** Static function to check if given info will lead to a valid configuration of @ref CLFullyConnectedLayer * - * @param[in] input Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32. - * @param[in] weights Weights tensor info. The weights must be 2 dimensional. - * If this function is called after a Convolution Layer, the (transposed) weights will have as many rows as the product of the first 3 input's dimensions. - * If it is called after another FullyConnected Layer, the (transposed) weights will have as many rows as the input's first dimension. - * Data type supported: Same as @p input. - * @param[in] biases Bias tensor info. Can be nullptr. Data type supported:Same as @p input. - * @param[out] output Destination tensor info. 
Its shape should be equal to the output of a matrix multiplication between: - * - The output of im2col on the input and the (transposed) 2D weights, if the function is called after a Convolution Layer - * - The input tensor and the (transposed) 2D weights, if the function is called after another FullyConnected Layer. - * Data type supported: Same as @p input. - * @param[in] fc_info (Optional) Fully connected layer additional info + * Similar to @ref CLFullyConnectedLayer * * @return a status */ @@ -184,28 +104,8 @@ public: void prepare() override; private: - void configure_fc_fc(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const FullyConnectedLayerInfo &fc_info); - void configure_conv_fc(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const FullyConnectedLayerInfo &fc_info); - void configure_mm(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const FullyConnectedLayerInfo &fc_info); - - MemoryGroup _memory_group; - IWeightsManager *_weights_manager; - CLConvertFullyConnectedWeights _convert_weights; - weights_transformations::CLConvertFullyConnectedWeightsManaged _convert_weights_managed; - weights_transformations::CLFullyConnectedLayerReshapeWeightsManaged _reshape_weights_managed_function; - CLFlattenLayer _flatten_layer; - CLTranspose _reshape_weights_function; - CLGEMM _mm_gemm; - CLGEMMLowpMatrixMultiplyCore _mm_gemmlowp; - CLTensor _flatten_output; - CLTensor _converted_weights_output; - CLTensor _reshape_weights_output; - bool _are_weights_converted; - bool _are_weights_reshaped; - bool _is_fc_after_conv; - bool _is_quantized; - bool _is_prepared; - const ICLTensor *_original_weights; + struct Impl; + std::unique_ptr _impl; }; } // namespace arm_compute #endif /* ARM_COMPUTE_CLFULLYCONNECTEDLAYER_H */ diff 
--git a/filelist.json b/filelist.json index 29db91018f..73ef4c73f5 100644 --- a/filelist.json +++ b/filelist.json @@ -133,6 +133,20 @@ ] } }, + "FullyConnected": { + "deps": [ + "ClFlatten", + "ClConvertFullyConnectedWeights", + "ClGemm", + "ClGemmLowpMatrixMultiplyCore", + "ClTranspose" + ], + "files": { + "operator": [ + "src/runtime/gpu/cl/operators/ClFullyConnected.cpp" + ] + } + }, "ConvertFullyConnectedWeights": { "files": { "operator": [ diff --git a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp index 0647a473e2..ae10cd23b1 100644 --- a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp +++ b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp @@ -23,194 +23,39 @@ */ #include "arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h" -#include "arm_compute/core/Size2D.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/runtime/CL/CLScheduler.h" -#include "src/core/CL/kernels/CLFillBorderKernel.h" -#include "src/core/gpu/cl/kernels/ClTransposeKernel.h" -#include "support/Cast.h" - -#include +#include "src/core/helpers/MemoryHelpers.h" +#include "src/runtime/gpu/cl/operators/ClFullyConnected.h" namespace arm_compute { -using namespace arm_compute::misc::shape_calculator; -using namespace arm_compute::utils::cast; +using namespace arm_compute::experimental; -namespace -{ -Status construct_gemmlowp_output_stage(const ITensorInfo &input, const ITensorInfo &weights, const ITensorInfo &output, - GEMMLowpOutputStageInfo &gemmlowp_output_stage, ActivationLayerInfo activation_info) +struct CLFullyConnectedLayer::Impl { - gemmlowp_output_stage.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT; - gemmlowp_output_stage.gemmlowp_offset = 0; - gemmlowp_output_stage.gemmlowp_multiplier = 0; - gemmlowp_output_stage.gemmlowp_shift = 
0; - - const auto data_type = input.data_type(); - - // Configure output stage for quantized case - if(is_data_type_quantized_asymmetric(data_type)) - { - const QuantizationInfo oq_info = output.quantization_info(); - const UniformQuantizationInfo iq_unif = input.quantization_info().uniform(); - const UniformQuantizationInfo wq_unif = weights.quantization_info().uniform(); - const UniformQuantizationInfo oq_unif = oq_info.uniform(); - - const auto output_quant_info = (output.total_size() == 0) ? iq_unif : oq_unif; - - const float multiplier = (iq_unif.scale * wq_unif.scale) / output_quant_info.scale; - int output_multiplier = 0; - int output_shift = 0; - ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift)); + MemoryGroup memory_group{}; + IWeightsManager *weights_manager{ nullptr }; - PixelValue type_min{}; - PixelValue type_max{}; - std::tie(type_min, type_max) = get_min_max(data_type); + std::unique_ptr op{ nullptr }; - if(activation_info.enabled()) - { - std::tie(type_min, type_max) = get_quantized_activation_min_max(activation_info, data_type, output_quant_info); - } + const ITensor *original_weights{ nullptr }; - // Set the GEMMLowp output stage info - gemmlowp_output_stage.gemmlowp_offset = output_quant_info.offset; - gemmlowp_output_stage.gemmlowp_multiplier = output_multiplier; - gemmlowp_output_stage.gemmlowp_shift = output_shift; - gemmlowp_output_stage.gemmlowp_multipliers.push_back(output_multiplier); - gemmlowp_output_stage.gemmlowp_shifts.push_back(output_shift); - type_min.get(gemmlowp_output_stage.gemmlowp_min_bound); - type_max.get(gemmlowp_output_stage.gemmlowp_max_bound); - } - - return Status{}; -} + ITensorPack run_pack{}; + WorkspaceData workspace{}; + experimental::MemoryRequirements aux_mem_req{}; -Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const ITensorInfo *bias, const ITensorInfo &output, const FullyConnectedLayerInfo &fc_info) -{ - 
GEMMLowpOutputStageInfo gemmlowp_output_stage; - ARM_COMPUTE_RETURN_ON_ERROR(construct_gemmlowp_output_stage(input, weights, output, gemmlowp_output_stage, fc_info.activation_info)); - - const GEMMInfo &gemm_info = GEMMInfo(false, // is_a_reshaped - false, // is_b_reshaped - true, // reshape_b_only_on_first_run - 0, // depth_output_gemm3d - false, // reinterpret_input_as_3d - fc_info.retain_internal_weights, // retain_internal_weights - gemmlowp_output_stage, // gemmlowp_output_stage - fc_info.fp_mixed_precision, // fp_mixed_precision - false, // fast_math - true, // broadcast_bias - ActivationLayerInfo()); // activation_info - - if(is_data_type_quantized_asymmetric(input.data_type())) - { - const UniformQuantizationInfo iq_info = input.quantization_info().uniform(); - const UniformQuantizationInfo wq_info = weights.quantization_info().uniform(); - - // Since we need negative offsets for computing convolution, we need to change QuantizationInfo() - // Extract and negate input and weights offset - const QuantizationInfo input_quantization_info(iq_info.scale, -iq_info.offset); - const QuantizationInfo weights_quantization_info(wq_info.scale, -wq_info.offset); - - // Validate gemmlowp function - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyCore::validate(&input.clone()->set_quantization_info(input_quantization_info), - &weights.clone()->set_quantization_info(weights_quantization_info), - bias, - &output, - gemm_info)); - } - else - { - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMM::validate(&input, &weights, bias, &output, 1.f, 1.f, gemm_info)); - } - - return Status{}; -} -} // namespace + bool is_prepared{ false }; +}; CLFullyConnectedLayer::CLFullyConnectedLayer(std::shared_ptr memory_manager, IWeightsManager *weights_manager) - : _memory_group(memory_manager), _weights_manager(weights_manager), _convert_weights(), _convert_weights_managed(), _reshape_weights_managed_function(), _flatten_layer(), _reshape_weights_function(), - _mm_gemm(memory_manager, weights_manager), 
_mm_gemmlowp(memory_manager), _flatten_output(), _converted_weights_output(), _reshape_weights_output(), _are_weights_converted(true), - _are_weights_reshaped(true), _is_fc_after_conv(true), _is_quantized(false), _is_prepared(false), _original_weights(nullptr) -{ -} - -void CLFullyConnectedLayer::configure_mm(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, - const FullyConnectedLayerInfo &fc_info) -{ - GEMMLowpOutputStageInfo gemmlowp_output_stage; - construct_gemmlowp_output_stage(*input->info(), *weights->info(), *output->info(), gemmlowp_output_stage, fc_info.activation_info); - - const GEMMInfo &gemm_info = GEMMInfo(false, // is_a_reshaped - false, // is_b_reshaped - true, // reshape_b_only_on_first_run - 0, // depth_output_gemm3d - false, // reinterpret_input_as_3d - fc_info.retain_internal_weights, // retain_internal_weights - gemmlowp_output_stage, // gemmlowp_output_stage - fc_info.fp_mixed_precision, // fp_mixed_precision - false, // fast_math - true, // broadcast_bias - fc_info.activation_info, // activation_info - fc_info.constant_weights); // constant_weights - - if(_is_quantized) - { - // Since we need negative offsets for computing convolution, we need to change QuantizationInfo() - // Extract and negate input and weights offset - const QuantizationInfo input_quantization_info = input->info()->quantization_info(); - const QuantizationInfo weights_quantization_info = weights->info()->quantization_info(); - - input->info()->set_quantization_info(QuantizationInfo(input_quantization_info.uniform().scale, -input_quantization_info.uniform().offset)); - weights->info()->set_quantization_info(QuantizationInfo(weights_quantization_info.uniform().scale, -weights_quantization_info.uniform().offset)); - - // Configure gemmlowp function - _mm_gemmlowp.configure(compile_context, input, weights, bias, output, gemm_info); - - // Revert back QuantizatioInfo as input and weights could 
be used in other fully connected layers - input->info()->set_quantization_info(input_quantization_info); - weights->info()->set_quantization_info(weights_quantization_info); - } - else - { - // Configure matrix multiply kernel - _mm_gemm.configure(compile_context, input, weights, bias, output, 1.f, 1.f, gemm_info); - } -} - -void CLFullyConnectedLayer::configure_conv_fc(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, - const FullyConnectedLayerInfo &fc_info) + : _impl(std::make_unique()) { - ARM_COMPUTE_ERROR_ON((weights->info()->dimension(1) != (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2)))); - - // If the fully connected layer is called after a convolution layer, the input tensor must be linearized - - // Initialize output tensor for flatten - TensorShape shape_flatten = compute_flatten_shape(input->info()); - _flatten_output.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_flatten).set_data_layout(DataLayout::NCHW)); - - // Configure flatten kernel - _memory_group.manage(&_flatten_output); - _flatten_layer.configure(compile_context, input, &_flatten_output); - - // Configure matrix multiply kernel - configure_mm(compile_context, &_flatten_output, weights, bias, output, fc_info); - - // Allocate the output tensor for flatten once all the configure methods have been called - _flatten_output.allocator()->allocate(); + _impl->memory_group = MemoryGroup(std::move(memory_manager)); + _impl->weights_manager = weights_manager; } -void CLFullyConnectedLayer::configure_fc_fc(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, - const FullyConnectedLayerInfo &fc_info) -{ - ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != weights->info()->dimension(1)); - - // Configure matrix multiply kernel - 
configure_mm(compile_context, input, weights, bias, output, fc_info); -} +CLFullyConnectedLayer::~CLFullyConnectedLayer() = default; void CLFullyConnectedLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, FullyConnectedLayerInfo fc_info) @@ -221,271 +66,48 @@ void CLFullyConnectedLayer::configure(const ICLTensor *input, const ICLTensor *w void CLFullyConnectedLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, FullyConnectedLayerInfo fc_info) { - ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); - // Perform validate step + ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); ARM_COMPUTE_ERROR_THROW_ON(CLFullyConnectedLayer::validate(input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), fc_info)); - _are_weights_converted = true; - _are_weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true; - _is_fc_after_conv = true; - _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type()); - _is_prepared = fc_info.retain_internal_weights; - _original_weights = weights; + _impl->op = std::make_unique(); + _impl->original_weights = weights; + _impl->is_prepared = false; - if(_weights_manager) - { - _weights_manager->manage(weights); - } - - const ICLTensor *weights_to_use = weights; - - // With the Fully Connected layer we can have 4 different cases: - // 1) Convolution layer -> Fully Connected layer without batches - // 2) Fully Connected layer -> Fully Connected layer without batches - // 3) Convolution layer -> Fully Connected layer with batches - // 4) Fully Connected layer -> Fully Connected layer with batches + _impl->op->configure(compile_context, input->info(), weights->info(), (biases != nullptr) ? 
biases->info() : nullptr, output->info(), fc_info); - // Check if we have a fully connected layer with batches - const bool is_batched_fc_layer = output->info()->dimension(1) > 1; - if(is_batched_fc_layer) - { - _is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && (std::equal(input->info()->tensor_shape().cbegin() + 3, - input->info()->tensor_shape().cend(), - output->info()->tensor_shape().cbegin() + 1)); - } - else - { - _is_fc_after_conv = input->info()->num_dimensions() > 1; - } - - // Reshape weights if needed - if(!_are_weights_reshaped) - { - if(_weights_manager && _weights_manager->are_weights_managed(weights)) - { - _reshape_weights_managed_function.configure(compile_context, weights); - weights_to_use = utils::cast::polymorphic_downcast(_weights_manager->acquire(weights, &_reshape_weights_managed_function)); - } - else - { - // Reshape the weights - _reshape_weights_function.configure(compile_context, weights, &_reshape_weights_output); - weights_to_use = &_reshape_weights_output; - } - } - - // Convert weights if needed - if(_is_fc_after_conv && (input->info()->data_layout() != fc_info.weights_trained_layout)) - { - if(_weights_manager && _weights_manager->are_weights_managed(weights_to_use)) - { - _convert_weights_managed.configure(compile_context, weights_to_use, - input->info()->tensor_shape(), - fc_info.weights_trained_layout); - weights_to_use = utils::cast::polymorphic_downcast(_weights_manager->acquire(weights, &_convert_weights_managed)); - } - else - { - // Convert weights - _convert_weights.configure(compile_context, weights_to_use, - &_converted_weights_output, - input->info()->tensor_shape(), - fc_info.weights_trained_layout); - - weights_to_use = &_converted_weights_output; - } - _are_weights_converted = false; - } - - if(_is_fc_after_conv) - { - // Fully Connected layer after a Convolution Layer without batches - configure_conv_fc(compile_context, input, weights_to_use, biases, output, fc_info); - } - else - { - // Fully Connected 
layer after a Fully Connected Layer without batches - configure_fc_fc(compile_context, input, weights_to_use, biases, output, fc_info); - } + _impl->aux_mem_req = _impl->op->workspace(); + _impl->run_pack = { { ACL_SRC_0, input }, { ACL_SRC_1, weights }, { ACL_SRC_2, biases }, { ACL_DST, output } }; + _impl->workspace = manage_workspace(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->run_pack); } Status CLFullyConnectedLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, FullyConnectedLayerInfo fc_info) { - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output); - ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 2); - ARM_COMPUTE_RETURN_ERROR_ON(fc_info.activation_info.enabled() && is_data_type_quantized(input->data_type()) && fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::RELU - && fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::BOUNDED_RELU && fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU); - ARM_COMPUTE_RETURN_ERROR_ON(!fc_info.constant_weights && (!fc_info.are_weights_reshaped || fc_info.transpose_weights)); - - bool weights_reshaped = fc_info.transpose_weights ? 
fc_info.are_weights_reshaped : true; - bool is_fc_after_conv = true; - - const ITensorInfo &flatten_input = TensorInfo(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_flatten_shape(input)).set_data_layout(DataLayout::NCHW)); - const ITensorInfo &reshaped_weights = TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_transposed_shape(*weights))); - const ITensorInfo &converted_weights = weights_reshaped ? TensorInfo(weights->clone()->set_is_resizable(true).reset_padding()) : TensorInfo(*reshaped_weights.clone()); - - // With the Fully Connected layer we can have 4 different cases: - // 1) Convolution layer -> Fully Connected layer without batches - // 2) Fully Connected layer -> Fully Connected layer without batches - // 3) Convolution layer -> Fully Connected layer with batches - // 4) Fully Connected layer -> Fully Connected layer with batches - - const ITensorInfo *input_to_use = input; - const ITensorInfo *weights_to_use = weights; - - // Check if we have a fully connected layer with batches - const bool is_batched_fc_layer = output->dimension(1) > 1; - if(is_batched_fc_layer) - { - is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && (std::equal(input->tensor_shape().cbegin() + 3, - input->tensor_shape().cend(), - output->tensor_shape().cbegin() + 1)); - } - else - { - is_fc_after_conv = input->num_dimensions() > 1; - } - - if(!weights_reshaped) - { - // Validate reshape weights kernel - ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(weights, &reshaped_weights)); - weights_to_use = &reshaped_weights; - } - - if(is_fc_after_conv && (input->data_layout() != fc_info.weights_trained_layout)) - { - // Validate convert weights kernel - ARM_COMPUTE_RETURN_ON_ERROR(CLConvertFullyConnectedWeights::validate(weights_to_use, - &converted_weights, - input->tensor_shape(), - fc_info.weights_trained_layout)); - weights_to_use = &converted_weights; - } - - if(is_fc_after_conv) - { - // Fully 
Connected layer after a Convolution Layer without batches - ARM_COMPUTE_RETURN_ERROR_ON((weights_to_use->dimension(1) != (input->dimension(0) * input->dimension(1) * input->dimension(2)))); - - // Validate flatten kernel - ARM_COMPUTE_RETURN_ON_ERROR(CLFlattenLayer::validate(input, &flatten_input)); - input_to_use = &flatten_input; - } - else - { - // Fully Connected layer after a Fully Connected Layer without batches - ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != weights_to_use->dimension(1)); - } - - // Validate matrix multiply kernel - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(*input_to_use, *weights_to_use, biases, *output, fc_info)); - - return Status{}; + return opencl::ClFullyConnected::validate(input, weights, biases, output, fc_info); } void CLFullyConnectedLayer::run() { prepare(); - MemoryGroupResourceScope scope_mg(_memory_group); - - // Linearize input if it comes from a convolutional layer - if(_is_fc_after_conv) - { - _flatten_layer.run(); - } - - // Run matrix multiply - if(_is_quantized) - { - _mm_gemmlowp.run(); - } - else - { - _mm_gemm.run(); - } + MemoryGroupResourceScope scope_mg(_impl->memory_group); + _impl->op->run(_impl->run_pack); } void CLFullyConnectedLayer::prepare() { - if(!_is_prepared) + if(!_impl->is_prepared) { - if(!_weights_manager) - { - ARM_COMPUTE_ERROR_ON(!_original_weights->is_used()); - } - - auto release_unused = [](CLTensor * w) - { - if(!w->is_used()) - { - CLScheduler::get().queue().finish(); - w->allocator()->free(); - } - }; - - // Pointer to current weights - const ICLTensor *cur_weights = _original_weights; - - // Reshape of the weights if needed (happens only once) - if(!_are_weights_reshaped) - { - if(_weights_manager && _weights_manager->are_weights_managed(_original_weights)) - { - cur_weights = utils::cast::polymorphic_downcast(_weights_manager->run(cur_weights, &_reshape_weights_managed_function)); - } - else - { - // Run reshape weights kernel and mark weights as unused - 
_reshape_weights_output.allocator()->allocate(); - _reshape_weights_function.run(); - - cur_weights->mark_as_unused(); - cur_weights = &_reshape_weights_output; - } - _are_weights_reshaped = true; - } - - // Convert weights if needed (happens only once) - if(!_are_weights_converted) - { - if(_weights_manager && _weights_manager->are_weights_managed(cur_weights)) - { - _weights_manager->run(cur_weights, &_convert_weights_managed); - } - else - { - _converted_weights_output.allocator()->allocate(); - _convert_weights.run(); - cur_weights->mark_as_unused(); - } - - _are_weights_converted = true; - } - - // Release reshaped weights if unused - release_unused(&_reshape_weights_output); - - // Prepare GEMM prepare and release unused weights - if(!_is_quantized) - { - _mm_gemm.prepare(); - } - - // Release converted weights if unused - release_unused(&_reshape_weights_output); - release_unused(&_converted_weights_output); + _impl->op->prepare(_impl->run_pack); - _is_prepared = true; + // Release temporary tensors that are only used in prepare stage + release_temporaries(_impl->aux_mem_req, _impl->workspace); + _impl->is_prepared = true; } } } // namespace arm_compute diff --git a/src/runtime/gpu/cl/operators/ClFullyConnected.cpp b/src/runtime/gpu/cl/operators/ClFullyConnected.cpp new file mode 100644 index 0000000000..377168d864 --- /dev/null +++ b/src/runtime/gpu/cl/operators/ClFullyConnected.cpp @@ -0,0 +1,496 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/runtime/gpu/cl/operators/ClFullyConnected.h" + +#include "arm_compute/core/Size2D.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" + +#include "src/core/helpers/MemoryHelpers.h" +#include "src/runtime/gpu/cl/operators/ClConvertFullyConnectedWeights.h" +#include "src/runtime/gpu/cl/operators/ClFlatten.h" +#include "src/runtime/gpu/cl/operators/ClGemm.h" +#include "src/runtime/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.h" +#include "src/runtime/gpu/cl/operators/ClTranspose.h" +#include "src/runtime/gpu/cl/utils/ClAuxTensorHandler.h" + +#include "support/Cast.h" + +#include + +namespace arm_compute +{ +namespace opencl +{ +using namespace arm_compute::experimental; +using namespace arm_compute::misc::shape_calculator; + +namespace +{ +Status construct_gemmlowp_output_stage(const ITensorInfo &src, const ITensorInfo &weights, const ITensorInfo &dst, + GEMMLowpOutputStageInfo &gemmlowp_output_stage, ActivationLayerInfo activation_info) +{ + gemmlowp_output_stage.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT; + gemmlowp_output_stage.gemmlowp_offset = 0; + gemmlowp_output_stage.gemmlowp_multiplier = 0; + gemmlowp_output_stage.gemmlowp_shift = 0; + + const auto data_type = src.data_type(); + + // Configure output stage for quantized case + if(is_data_type_quantized_asymmetric(data_type)) + { + const QuantizationInfo oq_info = dst.quantization_info(); + const UniformQuantizationInfo iq_unif = src.quantization_info().uniform(); + const UniformQuantizationInfo wq_unif = weights.quantization_info().uniform(); + const UniformQuantizationInfo oq_unif = oq_info.uniform(); + + const auto output_quant_info = (dst.total_size() == 0) ? 
iq_unif : oq_unif; + + const float multiplier = (iq_unif.scale * wq_unif.scale) / output_quant_info.scale; + int output_multiplier = 0; + int output_shift = 0; + ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift)); + + PixelValue type_min{}; + PixelValue type_max{}; + std::tie(type_min, type_max) = get_min_max(data_type); + + if(activation_info.enabled()) + { + std::tie(type_min, type_max) = get_quantized_activation_min_max(activation_info, data_type, output_quant_info); + } + + // Set the GEMMLowp output stage info + gemmlowp_output_stage.gemmlowp_offset = output_quant_info.offset; + gemmlowp_output_stage.gemmlowp_multiplier = output_multiplier; + gemmlowp_output_stage.gemmlowp_shift = output_shift; + gemmlowp_output_stage.gemmlowp_multipliers.push_back(output_multiplier); + gemmlowp_output_stage.gemmlowp_shifts.push_back(output_shift); + type_min.get(gemmlowp_output_stage.gemmlowp_min_bound); + type_max.get(gemmlowp_output_stage.gemmlowp_max_bound); + } + + return Status{}; +} + +Status validate_mm(const ITensorInfo &src, const ITensorInfo &weights, const ITensorInfo *bias, const ITensorInfo &dst, const FullyConnectedLayerInfo &fc_info) +{ + GEMMLowpOutputStageInfo gemmlowp_output_stage; + ARM_COMPUTE_RETURN_ON_ERROR(construct_gemmlowp_output_stage(src, weights, dst, gemmlowp_output_stage, fc_info.activation_info)); + + const GEMMInfo &gemm_info = GEMMInfo(false, // is_a_reshaped + false, // is_b_reshaped + true, // reshape_b_only_on_first_run + 0, // depth_output_gemm3d + false, // reinterpret_input_as_3d + fc_info.retain_internal_weights, // retain_internal_weights + gemmlowp_output_stage, // gemmlowp_output_stage + fc_info.fp_mixed_precision, // fp_mixed_precision + false, // fast_math + true, // broadcast_bias + ActivationLayerInfo()); // activation_info + + if(is_data_type_quantized_asymmetric(src.data_type())) + { + const UniformQuantizationInfo iq_info = src.quantization_info().uniform(); 
+ const UniformQuantizationInfo wq_info = weights.quantization_info().uniform(); + + // Since we need negative offsets for computing convolution, we need to change QuantizationInfo() + // Extract and negate src and weights offset + const QuantizationInfo src_quantization_info(iq_info.scale, -iq_info.offset); + const QuantizationInfo weights_quantization_info(wq_info.scale, -wq_info.offset); + + // Validate gemmlowp function + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyCore::validate(&src.clone()->set_quantization_info(src_quantization_info), + &weights.clone()->set_quantization_info(weights_quantization_info), + bias, + &dst, + gemm_info)); + } + else + { + ARM_COMPUTE_RETURN_ON_ERROR(ClGemm::validate(&src, &weights, bias, &dst, 1.f, 1.f, gemm_info)); + } + + return Status{}; +} +} // namespace + +ClFullyConnected::ClFullyConnected() + : _convert_weights(nullptr), + _flatten(nullptr), + _reshape_weights(nullptr), + _mm_gemm(nullptr), + _mm_gemmlowp(nullptr), + _aux_mem(Count) +{ +} + +ClFullyConnected::~ClFullyConnected() = default; + +void ClFullyConnected::configure_mm(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *bias, ITensorInfo *dst, + const FullyConnectedLayerInfo &fc_info) +{ + GEMMLowpOutputStageInfo gemmlowp_output_stage; + construct_gemmlowp_output_stage(*src, *weights, *dst, gemmlowp_output_stage, fc_info.activation_info); + + const GEMMInfo &gemm_info = GEMMInfo(false, // is_a_reshaped + false, // is_b_reshaped + true, // reshape_b_only_on_first_run + 0, // depth_output_gemm3d + false, // reinterpret_input_as_3d + fc_info.retain_internal_weights, // retain_internal_weights + gemmlowp_output_stage, // gemmlowp_output_stage + fc_info.fp_mixed_precision, // fp_mixed_precision + false, // fast_math + true, // broadcast_bias + fc_info.activation_info, // activation_info + fc_info.constant_weights); // constant_weights + + if(_is_quantized) + { + // Since we need negative offsets for computing 
convolution, we need to change QuantizationInfo() + // Extract and negate input and weights offset + const QuantizationInfo src_quantization_info = src->quantization_info(); + const QuantizationInfo weights_quantization_info = weights->quantization_info(); + + TensorInfo src_info = src->clone()->set_quantization_info(src_quantization_info); + TensorInfo weights_info = weights->clone()->set_quantization_info(weights_quantization_info); + + src_info.set_quantization_info(QuantizationInfo(src_quantization_info.uniform().scale, -src_quantization_info.uniform().offset)); + weights_info.set_quantization_info(QuantizationInfo(weights_quantization_info.uniform().scale, -weights_quantization_info.uniform().offset)); + + // Configure gemmlowp function + _mm_gemmlowp = std::make_unique<ClGemmLowpMatrixMultiplyCore>(); + _mm_gemmlowp->configure(compile_context, &src_info, &weights_info, bias, dst, gemm_info); + } + else + { + // Configure matrix multiply kernel + _mm_gemm = std::make_unique<ClGemm>(); + _mm_gemm->configure(compile_context, src, weights, bias, dst, 1.f, 1.f, gemm_info); + } +} + +void ClFullyConnected::configure_conv_fc(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *bias, ITensorInfo *dst, + const FullyConnectedLayerInfo &fc_info) +{ + ARM_COMPUTE_ERROR_ON((weights->dimension(1) != (src->dimension(0) * src->dimension(1) * src->dimension(2)))); + + // If the fully connected layer is called after a convolution layer, the input tensor must be linearized + + // Initialize output tensor for flatten + _flattened_src = src->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_flatten_shape(src)).set_data_layout(DataLayout::NCHW); + + // Configure flatten kernel + _flatten = std::make_unique<ClFlatten>(); + _flatten->configure(compile_context, src, &_flattened_src); + + // Configure matrix multiply kernel + configure_mm(compile_context, &_flattened_src, weights, bias, dst, fc_info); +} + +void ClFullyConnected::configure_fc_fc(const CLCompileContext 
&compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *bias, ITensorInfo *dst, + const FullyConnectedLayerInfo &fc_info) +{ + ARM_COMPUTE_ERROR_ON(src->dimension(0) != weights->dimension(1)); + + // Configure matrix multiply kernel + configure_mm(compile_context, src, weights, bias, dst, fc_info); +} + +void ClFullyConnected::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, + FullyConnectedLayerInfo fc_info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst); + + // Perform validate step + ARM_COMPUTE_ERROR_THROW_ON(ClFullyConnected::validate(src, weights, biases, dst, fc_info)); + + _are_weights_converted = true; + _are_weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true; + _is_fc_after_conv = true; + _is_quantized = is_data_type_quantized_asymmetric(src->data_type()); + _is_prepared = fc_info.retain_internal_weights; + _weights_to_use = TensorInfo(*weights); + _weights_to_use_idx = ACL_SRC_1; + + // With the Fully Connected layer we can have 4 different cases: + // 1) Convolution layer -> Fully Connected layer without batches + // 2) Fully Connected layer -> Fully Connected layer without batches + // 3) Convolution layer -> Fully Connected layer with batches + // 4) Fully Connected layer -> Fully Connected layer with batches + + // Check if we have a fully connected layer with batches + const bool is_batched_fc_layer = dst->dimension(1) > 1; + if(is_batched_fc_layer) + { + _is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && (std::equal(src->tensor_shape().cbegin() + 3, + src->tensor_shape().cend(), + dst->tensor_shape().cbegin() + 1)); + } + else + { + _is_fc_after_conv = src->num_dimensions() > 1; + } + + ITensorInfo *weights_used = weights; + + // Reshape weights if needed + if(!_are_weights_reshaped) + { + // Reshape the weights + _reshape_weights = std::make_unique<ClTranspose>(); + _reshape_weights->configure(compile_context, 
weights, &_reshaped_weights); + weights_used = &_reshaped_weights; + _weights_to_use_idx = offset_int_vec(TransposedWeights); + } + + // Convert weights if needed + if(_is_fc_after_conv && (src->data_layout() != fc_info.weights_trained_layout)) + { + // Convert weights + _convert_weights = std::make_unique<ClConvertFullyConnectedWeights>(); + _convert_weights->configure(compile_context, + weights_used, + &_converted_weights, + src->tensor_shape(), + fc_info.weights_trained_layout); + + weights_used = &_converted_weights; + _weights_to_use_idx = offset_int_vec(ConvertedWeights); + _are_weights_converted = false; + } + + if(_is_fc_after_conv) + { + // Fully Connected layer after a Convolution Layer without batches + configure_conv_fc(compile_context, src, weights_used, biases, dst, fc_info); + } + else + { + // Fully Connected layer after a Fully Connected Layer without batches + configure_fc_fc(compile_context, src, weights_used, biases, dst, fc_info); + } + // Update TensorInfo of final weights used (Need to be done in the end due to padding expansion) + _weights_to_use = *weights_used; + + // Set auxiliary memory requirements + auto gemm_mem_req = (_is_quantized) ? _mm_gemmlowp->workspace() : _mm_gemm->workspace(); + for(unsigned int i = 0; i < gemm_mem_req.size(); ++i) + { + _aux_mem[i] = gemm_mem_req[i]; + } + if(_aux_mem[1].size > 0 || _aux_mem[2].size > 0) // Persistent weights memory on GEMMs + { + // Release permuted weights at the end of prepare as they are further transposed by the assembly dispatch + _aux_mem[TransposedWeights] = MemoryInfo(offset_int_vec(TransposedWeights), MemoryLifetime::Prepare, _reshaped_weights.total_size()); + _aux_mem[ConvertedWeights] = MemoryInfo(offset_int_vec(ConvertedWeights), MemoryLifetime::Prepare, _converted_weights.total_size()); + } + else + { + // Keep the weights that are finally used persistent; the other intermediate weights are only needed until the end of prepare + const auto transposed_wei_lft = (_weights_to_use_idx == offset_int_vec(TransposedWeights)) ? 
MemoryLifetime::Persistent : MemoryLifetime::Prepare; + const auto converted_wei_lft = (_weights_to_use_idx == offset_int_vec(ConvertedWeights)) ? MemoryLifetime::Persistent : MemoryLifetime::Prepare; + + _aux_mem[TransposedWeights] = MemoryInfo(offset_int_vec(TransposedWeights), transposed_wei_lft, _reshaped_weights.total_size()); + _aux_mem[ConvertedWeights] = MemoryInfo(offset_int_vec(ConvertedWeights), converted_wei_lft, _converted_weights.total_size()); + } + _aux_mem[FlattenedSrc] = MemoryInfo(offset_int_vec(FlattenedSrc), MemoryLifetime::Temporary, _flattened_src.total_size()); +} + +Status ClFullyConnected::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, + FullyConnectedLayerInfo fc_info) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights, dst); + ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 2); + ARM_COMPUTE_RETURN_ERROR_ON(fc_info.activation_info.enabled() && is_data_type_quantized(src->data_type()) && fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::RELU + && fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::BOUNDED_RELU && fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU); + ARM_COMPUTE_RETURN_ERROR_ON(!fc_info.constant_weights && (!fc_info.are_weights_reshaped || fc_info.transpose_weights)); + + bool weights_reshaped = fc_info.transpose_weights ? 
fc_info.are_weights_reshaped : true; + bool is_fc_after_conv = true; + + const ITensorInfo &flatten_src = TensorInfo(src->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_flatten_shape(src)).set_data_layout(DataLayout::NCHW)); + const ITensorInfo &reshaped_weights = TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_transposed_shape(*weights))); + const ITensorInfo &converted_weights = weights_reshaped ? TensorInfo(weights->clone()->set_is_resizable(true).reset_padding()) : TensorInfo(*reshaped_weights.clone()); + + // With the Fully Connected layer we can have 4 different cases: + // 1) Convolution layer -> Fully Connected layer without batches + // 2) Fully Connected layer -> Fully Connected layer without batches + // 3) Convolution layer -> Fully Connected layer with batches + // 4) Fully Connected layer -> Fully Connected layer with batches + + const ITensorInfo *src_to_use = src; + const ITensorInfo *weights_to_use = weights; + + // Check if we have a fully connected layer with batches + const bool is_batched_fc_layer = dst->dimension(1) > 1; + if(is_batched_fc_layer) + { + is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && (std::equal(src->tensor_shape().cbegin() + 3, + src->tensor_shape().cend(), + dst->tensor_shape().cbegin() + 1)); + } + else + { + is_fc_after_conv = src->num_dimensions() > 1; + } + + if(!weights_reshaped) + { + // Validate reshape weights kernel + ARM_COMPUTE_RETURN_ON_ERROR(ClTranspose::validate(weights, &reshaped_weights)); + weights_to_use = &reshaped_weights; + } + + if(is_fc_after_conv && (src->data_layout() != fc_info.weights_trained_layout)) + { + // Validate convert weights kernel + ARM_COMPUTE_RETURN_ON_ERROR(ClConvertFullyConnectedWeights::validate(weights_to_use, + &converted_weights, + src->tensor_shape(), + fc_info.weights_trained_layout)); + weights_to_use = &converted_weights; + } + + if(is_fc_after_conv) + { + // Fully Connected layer after a 
Convolution Layer without batches + ARM_COMPUTE_RETURN_ERROR_ON((weights_to_use->dimension(1) != (src->dimension(0) * src->dimension(1) * src->dimension(2)))); + + // Validate flatten kernel + ARM_COMPUTE_RETURN_ON_ERROR(ClFlatten::validate(src, &flatten_src)); + src_to_use = &flatten_src; + } + else + { + // Fully Connected layer after a Fully Connected Layer without batches + ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) != weights_to_use->dimension(1)); + } + + // Validate matrix multiply kernel + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(*src_to_use, *weights_to_use, biases, *dst, fc_info)); + + return Status{}; +} + +void ClFullyConnected::run(ITensorPack &tensors) +{ + prepare(tensors); + + auto src = tensors.get_const_tensor(ACL_SRC_0); + + CLAuxTensorHandler flattened_src(offset_int_vec(FlattenedSrc), _flattened_src, tensors, false); + CLAuxTensorHandler weights(_weights_to_use_idx, _weights_to_use, tensors, false); + + // Linearize input if it comes from a convolutional layer + if(_is_fc_after_conv) + { + ITensorPack flatten_pack{ { ACL_SRC, src }, { ACL_DST, flattened_src.get() } }; + _flatten->run(flatten_pack); + } + + ITensorPack gemm_pack = tensors; + gemm_pack.add_const_tensor(ACL_SRC_0, (_is_fc_after_conv) ? 
flattened_src.get() : src); + if(_weights_to_use_idx != ACL_SRC_1) + { + gemm_pack.add_const_tensor(ACL_SRC_1, weights.get()); + } + + // Run matrix multiply + if(_is_quantized) + { + _mm_gemmlowp->run(gemm_pack); + } + else + { + _mm_gemm->run(gemm_pack); + } +} + +void ClFullyConnected::prepare(ITensorPack &tensors) +{ + if(!_is_prepared) + { + auto weights = tensors.get_const_tensor(ACL_SRC_1); + + CLAuxTensorHandler reshaped_weights(offset_int_vec(TransposedWeights), _reshaped_weights, tensors, false); + CLAuxTensorHandler converted_weights(offset_int_vec(ConvertedWeights), _converted_weights, tensors, false); + + // Pointer to current weights + const ITensor *cur_weights = weights; + + // Reshape of the weights if needed (happens only once) + if(!_are_weights_reshaped) + { + // Run reshape weights kernel and mark weights as unused + ITensorPack transpose_pack{ { ACL_SRC, weights }, { ACL_DST, reshaped_weights.get() } }; + _reshape_weights->run(transpose_pack); + + cur_weights->mark_as_unused(); + cur_weights = reshaped_weights.get(); + + _are_weights_reshaped = true; + } + + // Convert weights if needed (happens only once) + if(!_are_weights_converted) + { + ITensorPack convert_pack{ { ACL_SRC, cur_weights }, { ACL_DST, converted_weights.get() } }; + _convert_weights->run(convert_pack); + + cur_weights->mark_as_unused(); + cur_weights = converted_weights.get(); + + _are_weights_converted = true; + } + + tensors.add_const_tensor(ACL_SRC_1, cur_weights); + + // Prepare GEMM prepare and release unused weights + if(!_is_quantized) + { + _mm_gemm->prepare(tensors); + } + else + { + _mm_gemmlowp->prepare(tensors); + } + _is_prepared = true; + } +} + +experimental::MemoryRequirements ClFullyConnected::workspace() const +{ + return _aux_mem; +} +} // namespace opencl +} // namespace arm_compute diff --git a/src/runtime/gpu/cl/operators/ClFullyConnected.h b/src/runtime/gpu/cl/operators/ClFullyConnected.h new file mode 100644 index 0000000000..86f95756d5 --- /dev/null 
+++ b/src/runtime/gpu/cl/operators/ClFullyConnected.h @@ -0,0 +1,138 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CL_FULLY_CONNECTED_H +#define ARM_COMPUTE_CL_FULLY_CONNECTED_H + +#include "arm_compute/core/TensorInfo.h" + +#include "src/core/gpu/cl/ClCompileContext.h" +#include "src/runtime/gpu/cl/IClOperator.h" + +#include <memory> + +namespace arm_compute +{ +namespace opencl +{ +// Forward declarations +class ClConvertFullyConnectedWeights; +class ClFlatten; +class ClGemm; +class ClGemmLowpMatrixMultiplyCore; +class ClTranspose; + +/** Basic function to compute a Fully Connected layer on OpenCL. 
This function calls the following OpenCL kernels: + * + * -# @ref opencl::kernels::ClIm2ColKernel (called when the input comes from a convolutional layer) + * -# @ref CLTranspose (if @p are_weights_reshaped is set to false and transpose_weights is set to true ) (called once) + * -# @ref opencl::kernels::ClGemmMatrixMultiplyKernel or @ref CLGEMMLowpMatrixMultiplyCore (if quantized asymmetric) + * + * @note The fully connected layer accepts "weights" tensors only with 2 dimensions. + */ +class ClFullyConnected : public IClOperator +{ +public: + ClFullyConnected(); + ~ClFullyConnected(); + /** Set the input and output tensors. + * + * Valid data layouts: + * - NHWC + * - NCHW + * + * Valid data type configurations: + * |src0 |src1 |src2 |dst | + * |:--------------|:------------------|:------|:--------------| + * |F16 |F16 |F16 |F16 | + * |F32 |F32 |F32 |F32 | + * |QASYMM8 |QASYMM8 |S32 |QASYMM8 | + * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |QASYMM8_SIGNED | + * + * @param[in] compile_context The compile context to be used. + * @param[in] src Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32. + * @param[in] weights Weights tensor. The weights must be 2 dimensional. + * If this function is called after a Convolution Layer, the (transposed) weights will have as many rows as the product of the first 3 input's dimensions. + * If it is called after another FullyConnected Layer, the (transposed) weights will have as many rows as the input's first dimension. + * Data type supported: Same as @p src. + * @param[in] biases Bias tensor. Can be nullptr. Data type supported:Same as @p src. + * @param[out] dst Destination tensor. Its shape should be equal to the output of a matrix multiplication between: + * - The output of im2col on the input and the (transposed) 2D weights, if the function is called after a Convolution Layer + * - The input tensor and the (transposed) 2D weights, if the function is called after another FullyConnected Layer. 
+ * Data type supported: Same as @p src. + * @param[in] fc_info (Optional) Fully connected layer additional info + */ + void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, + FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo()); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to ClFullyConnected::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, + FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo()); + + // Inherited methods overridden + void run(ITensorPack &tensors) override; + void prepare(ITensorPack &tensors) override; + experimental::MemoryRequirements workspace() const override; + +private: + void configure_fc_fc(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *bias, ITensorInfo *dst, const FullyConnectedLayerInfo &fc_info); + void configure_conv_fc(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *bias, ITensorInfo *dst, const FullyConnectedLayerInfo &fc_info); + void configure_mm(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *bias, ITensorInfo *dst, const FullyConnectedLayerInfo &fc_info); + +private: + enum AuxTensorIdx + { + TransposedWeights = 10, + ConvertedWeights = 11, + FlattenedSrc = 12, + Count = 13 + }; + + std::unique_ptr<ClConvertFullyConnectedWeights> _convert_weights; + std::unique_ptr<ClFlatten> _flatten; + std::unique_ptr<ClTranspose> _reshape_weights; + std::unique_ptr<ClGemm> _mm_gemm; + std::unique_ptr<ClGemmLowpMatrixMultiplyCore> _mm_gemmlowp; + + experimental::MemoryRequirements _aux_mem{}; + + TensorInfo _flattened_src{}; + TensorInfo _converted_weights{}; + TensorInfo _reshaped_weights{}; + + TensorInfo _weights_to_use{}; + int _weights_to_use_idx{ ACL_SRC_1 }; + + bool _are_weights_converted{ true }; + bool _are_weights_reshaped{ 
true }; + bool _is_fc_after_conv{ true }; + bool _is_quantized{ false }; + bool _is_prepared{ false }; +}; +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_FULLY_CONNECTED_H */ diff --git a/src/runtime/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.h b/src/runtime/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.h index 941c169118..36a4257b86 100644 --- a/src/runtime/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.h +++ b/src/runtime/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.h @@ -108,11 +108,11 @@ public: private: enum AuxTensorIdx { - VecSumCol = 0, - VecSumRow, + ResultS32 = 0, RhsQAsymm8, RhsReshape, - ResultS32, + VecSumCol, + VecSumRow, Multipliers, Shifts, Count -- cgit v1.2.1